title: "Cuculus Migratory Genomics"
author: "Justin Merondun"
date: "2025-02-14"
output:
html_document:
theme: journal
toc: true
toc_float:
collapsed: true
setwd('C:/Users/herit/Dropbox/CUCKOO_migration/Manuscript/Supplement/SupplementaryTables/')
library(tidyverse)
library(readxl)
library(meRo)
# Load the Excel file, skipping the first row
df <- read_excel("Supplementary_Tables.xlsx", skip = 1,sheet = 'S1') %>% as_tibble()
df %>% count(SpeciesShort)
df %>% mutate(Source = ifelse(Source == 'Blood','Blood',
ifelse(grepl('Feather',Source),'Feather',
'Muscle'))) %>%
count(Source)
summary(df$Raw_Bases_Gb)
sum(df$Raw_Bases_Gb)
sum(df$Raw_Reads_M)
summary(df$Mapped_Reads_M)
summary(df$MeanCoverage)
Business as usual; align and trim SRRs.
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=6
#SBATCH --mem-per-cpu=4763mb
#SBATCH --time=12:00:00
RUN=$1
wd=/dss/dsslegfs01/pr53da/pr53da-dss-0021/rawdata/WGS/ILLUMINA/
outdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/trimmed_fastq
RUN=$1
file1=$(ls ${wd}/${RUN}__R1__*)
file2=$(ls ${wd}/${RUN}__R2__*)
SCRATCH=/tmp/$SLURM_JOB_ID
#adapter trim
bbduk.sh t=6 -Xmx24g in=${file1} in2=${file2} out=${outdir}/${RUN}.trim.fastq.gz minlen=25 qtrim=rl trimq=2 ktrim=r k=23 mink=11 ref=~/modules/bbmap/adapters.fa hdist=1 tpe tbo
and submit:
for i in $(cat Libraries.list); do sbatch -J $i Trim.sh $i; done
This script will find read groups from the fastq files, provided you have normal fastq, and your files are named in our convention (SM__SRR.something.fastq.gz).
#!/bin/bash
#In a directory with the cleaned .fq.gz files, this will create ID (for us just SRR ID), SM (most important, for VCF files), LB (in case same library across multiple lanes), and PU (run data) fields. It will also count the number of SRRs per SM for merging later.
mkdir RGs
for i in $( ls *.trim.fastq.gz | sed 's/\..*//g' ); do
zcat $i.trim.fastq.gz | head -n 1 > ./RGs/$i.txt
awk 'BEGIN {FS=":"}{print FILENAME, $3, $4, $NF}' ./RGs/$i.txt > ./RGs/$i.tmp
ex -sc '%s/.txt//g' -c x ./RGs/$i.tmp
ex -sc '%s:./RGs/::g' -c x ./RGs/$i.tmp
#okay, now subet the parts we want
sed 's/__.*//g' ./RGs/$i.tmp > ./RGs/$i.SM
sed 's/.*__//g' ./RGs/$i.tmp | cut -d ' ' -f 1 > ./RGs/$i.ID
awk '{print $4}' ./RGs/$i.tmp > ./RGs/$i.tmp2
paste ./RGs/$i.SM ./RGs/$i.tmp2 | sed 's/\t/./g' > ./RGs/$i.LB
#run details for PU
zcat $i.trim.fastq.gz |head -n 1 |cut -f 3-5 -d ":" > ./RGs/$i.PU
#Determine number of runs per sample, and output to .numlibs file
ID="$(cat ./RGs/${i}.ID | tail -1)";
SM="$(cat ./RGs/${i}.SM | tail -1)";
echo ${ID} >> ./RGs/${SM}.libco
cat ./RGs/${SM}.libco | sort -u > ./RGs/${SM}.numlibs
done
rm ./RGs/*tmp*
rm ./RGs/*txt*
Take the gff file, remove the whole ‘region’ (chromosome length), the remaining space is intergenic:
bedtools sort -i GCA_017976375.1_bCucCan1.pri_genomic.CHR.gff | awk '$3 != "region"' | bedtools complement -i - -g GCA_017976375.1_bCucCan1.pri_genomic.CHR.genome.ordered -L > intergenic.bed
Grab introns:
#extract exons
awk '$3=="exon"' GCA_017976375.1_bCucCan1.pri_genomic.CHR.gff | bedtools sort > exons.bed
#extract genes
awk '$3=="gene"' GCA_017976375.1_bCucCan1.pri_genomic.CHR.gff | bedtools sort > genes.bed
# Get introns by subtracting exons from genes
bedtools subtract -a genes.bed -b exons.bed > introns.bed
awk '{OFS="\t"}{print $1, $4-1, $5}' introns.bed > introns_zerobased.bed
Now find degenerate sites using this github to identify 4-fold sites:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=2
#SBATCH --mem-per-cpu=4763mb
#SBATCH --time=48:00:00
GFF=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.gff
GENOME=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
python /dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/neutral/degeneracy/get_degeneracy.py $GFF $GENOME cuckoo
#convert to bed
awk '{OFS="\t"}{print $1, $2-1, $2}' cuckoo.codon3-fold4.pos > 4fold.bed
And then just merge the beds, and ensure you merge overlapping regions:
cat intergenic.bed introns_zerobased.bed 4fold.bed | \
bedtools sort -i - | \
bedtools merge -i - > /dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/extracting_neutral_sites/neutral-sites_cuckoo__intergenic-intron-4fold.bed
awk '{print $3- $2}' neutral-sites_cuckoo__intergenic-intron-4fold.bed | datamash sum 1
992158715
Create de novo repeat libraries
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=20
#SBATCH --time=200:00:00
RUN=$1
#conda activate repmod
#RepeatModeler version 2.0.3
BuildDatabase -name ${RUN} ${RUN}.chr.fa
RepeatModeler -database ${RUN} -pa 20 -LTRStruct >& ${RUN}.out
Filter TEs, largely following this tutorial:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_highmem
#SBATCH --cpus-per-task=10
#SBATCH --time=200:00:00
#conda activate te_ann
#USES cd-hit and pfam to identify redundant repeats and any which overlap proteins
pfamdb=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/wchrom/repeats/2023mar/pfam
#ensure there's no duplicate sequence IDs
seqkit rename Avian-families.fa > Avian-families.id.fa
#identify redundant hits
cd-hit-est -T 10 -i Avian-families.id.fa -o Avian-cdhit.fa -M 0 -d 0 -aS 0.8 -c 0.8 -G 0 -g 1 -b 500
#grab longest ORF
python -m jcvi.formats.fasta longestorf Avian-cdhit.fa > Avian-cdhit.orf.fasta
#translate as if they are vertebrate CDS
transeq -sequence Avian-cdhit.orf.fasta -outseq Avian-cdhit.orf.trns.fa -table 0 -frame 1
#and scan for proteins
pfam_scan.pl -fasta Avian-cdhit.orf.trns.fa -dir $pfamdb > pfam.results
#no hits found
Repeatmask by compartment (Z/W/A):
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=20
#SBATCH --time=200:00:00
#conda activate repmod
RUN=$1
#Now run repeatmasker with that library against the autosomes, W, and Z independently
for CHR in $(cat CHRS.list); do
seqtk subseq ${RUN}.chr.fa ${CHR}.list > ${RUN}-${CHR}.fa
RepeatMasker -pa 20 -a -s -gff -no_is -lib Avian-cdhit.fa ${RUN}-${CHR}.fa &> RMaves_${RUN}-${CHR}.run.out
done
Parse outputs:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_highmem
#SBATCH --cpus-per-task=3
#SBATCH --time=24:00:00
#conda activate orthofinder
RUN=$1
#Now run repeatmasker with that library against the autosomes, W, and Z independently
for CHR in $(cat CHRS.list); do
perl /dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/repeatmodeler/Parsing-RepeatMasker-Outputs/parseRM.pl -i ${RUN}-${CHR}.fa.align -p -f ${RUN}.chr.fa -r Avian-cdhit.fa -v
done
And now, grab the repeats in bed format:
awk '{OFS="\t"}{print "chr_"$5, $6, $7, $9, $10, $11}' *txt | bedtools sort -i - > ~/symlinks/c00c00/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats.bed
#but, let's retain low complexity and simple repeats, and allow MQ to deal with that:
egrep -v 'Simple|Low_complexity' GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats.bed > GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats-ExcludingLowComplexity-SimpleRepeats.bed
repeats=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats-ExcludingLowComplexity-SimpleRepeats.bed
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/miganc')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(tidyverse)
library(viridis)
library(RColorBrewer)
library(ggpubr)
library(sf)
library(ggspatial)
library(spThin)
library(factoextra)
library(ggforce)
library(ggsci)
#read metadata
md = read_tsv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/miganc/Full_Metadata.txt')
#set up map and convert df to coordinate frame
world = map_data("world")
md = md %>% mutate(LatJit = jitter(Latitude,amount =1),
LonJit = jitter(Longitude,amount=1))
sites = st_as_sf(md, coords = c("LonJit", "LatJit"),
crs = 4326, agr = "constant")
spp = ggplot() +
geom_polygon(data = world, aes(x = long, y = lat, group = group), col='grey90', fill='white') +
geom_sf(data = sites,
aes(fill=Species,shape=Species),
size=2.5,alpha=0.8,show.legend = T,stroke=0.5) +
scale_fill_aaas()+
scale_shape_manual(values=c(21,24))+
xlab('') + ylab('') +
coord_sf(xlim = c(min(md$Longitude) - 15, max(md$Longitude) + 5),
ylim = c(min(md$Latitude) - 65, max(md$Latitude) + 5), expand = FALSE) +
theme_classic() +
theme(panel.border = element_rect(colour = "black", fill = NA, size = 1),
panel.background = element_rect(fill = "aliceblue"),
legend.text = element_text(size = 6),
legend.title = element_text(size = 6),
legend.key.size = unit(0.1, 'cm'),
legend.position = 'top') +
annotation_scale(line_width = 0.5)
pdf('figures/AllSamples_Extent_2023NOV22.pdf',height=6,width=9)
spp
dev.off()
Plot migration data, popgen data:
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/miganc')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(tidyverse)
library(viridis)
library(RColorBrewer)
library(ggpubr)
library(sf)
library(ggspatial)
library(spThin)
library(factoextra)
library(ggforce)
library(ggsci)
library(lubridate)
library(geosphere)
library(concaveman)
#read metadata
md = read_tsv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/miganc/Full_Metadata.txt')
md = md %>% mutate(Species = gsub('C. canorus','Cuculus canorus',Species),
Species = gsub('C. optatus','Cuculus optatus',Species))
#read spatial data
m = read_csv('03-BoD-deaths-removed-no-NDVI.csv')
m %>% select(dataset, individual.taxon.canonical.name,tag.local.identifier) %>% unique %>% count(individual.taxon.canonical.name)
#set up map and convert df to coordinate frame
world = map_data("world")
md = md %>% mutate(LatJit = jitter(Latitude,amount =1),
LonJit = jitter(Longitude,amount=1))
sites = st_as_sf(md, coords = c("LonJit", "LatJit"),
crs = 4326, agr = "constant")
tracks_df = m %>% filter(!grepl('satura',individual.taxon.canonical.name)) %>% select(individual.taxon.canonical.name,timestamp,Route,Wolf,tag.local.identifier,location.long,location.lat)
names(tracks_df) = c('Species','DT','Route','WolfID','ID','Long','Lat')
#assign 'year1', 'year2', 'year3', etc
tracks_df = tracks_df %>%
group_by(ID) %>%
mutate(DT = mdy_hm(DT)) %>%
arrange(DT) %>%
mutate(
Year_Num = cumsum(month(DT) == 11 & lag(month(DT), default = 10) != 11), # Increment each November
Year_Label = paste0("Year", Year_Num)
) %>%
ungroup()
tracks_df = tracks_df %>%
mutate(
Status = case_when(
month(DT) %in% c(12) ~ "Overwintering",
month(DT) %in% c(6) ~ "Breeding",
TRUE ~ NA_character_
)
)
# #remove any points further than 1000km
# tracks_df <- tracks_df %>%
# group_by(ID) %>%
# arrange(DT) %>%
# mutate(
# Prev_Long = lag(Long),
# Prev_Lat = lag(Lat),
# Distance = distHaversine(cbind(Prev_Long, Prev_Lat), cbind(Long, Lat))
# ) %>%
# ungroup() %>%
# mutate(Distance = Distance / 1000)
#
# tfx = tracks_df %>% drop_na(Distance) %>%
# filter(Distance < 250) #1000km x 1000m
#create points
tracks = st_as_sf(tracks_df, coords = c("Long", "Lat"),
crs = 4326, agr = "constant")
#create lines
tracks_lines <- tracks %>%
group_by(ID, Species) %>%
filter(n() >= 2) %>% # Keep only groups with at least two records
summarize(geometry = st_union(geometry)) %>%
st_cast("LINESTRING") %>%
st_as_sf()
#create convex hull polygons
#could only install concaveman and V8 with this:
#Sys.setenv(DOWNLOAD_STATIC_LIBV8 = 1)
#install.packages('V8')
counter = 0
for (stat in c('Breeding','Overwintering')) { for (sp in c('Cuculus canorus','Cuculus optatus')) { counter = counter + 1
t = tracks %>% filter(Status == stat & Species == sp)
p = concaveman(t,concavity=0.5,length_threshold=25)
p = p %>% mutate(Species = sp, Status = stat)
assign(paste0('p',counter),p)
}}
ps = rbind(p1,p2,p3,p4)
tpp = ggplot() +
geom_polygon(data = world, aes(x = long, y = lat, group = group), col='grey90', fill='white') +
geom_sf(data=tracks_lines,col='grey20',alpha=0.2,
size=0.15)+
# geom_sf(data = tracks %>% filter(Status == 'Breeding' | Status == 'Overwintering'),
# aes(fill=Status,shape=Species),
# size=2.5,alpha=0.8,show.legend = T,stroke=0.5) +
geom_sf(data = ps,aes(fill=Status),alpha=0.5)+
geom_sf(data = sites,pch=21,fill='white',col='black',size=2.5,alpha=0.8,show.legend = T,stroke=0.5) +
scale_fill_aaas()+
scale_color_brewer(palette = 'Set2')+
scale_shape_manual(values=c(16,15))+
facet_wrap(Species~.)+
xlab('') + ylab('') +
coord_sf(xlim = c(min(md$Longitude) - 15, max(md$Longitude) + 5),
ylim = c(min(md$Latitude) - 65, max(md$Latitude) + 5), expand = FALSE) +
theme_classic() +
theme(panel.border = element_rect(colour = "black", fill = NA, size = 1),
panel.background = element_rect(fill = "aliceblue"),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.key.size = unit(0.5, 'cm'),
legend.position = 'top') +
annotation_scale(line_width = 0.5)
tpp
pdf('figures/AllSamples_Extent-Species_2023NOV27.pdf',height=6,width=12)
tpp
dev.off()
#only BTO
uk_filtered <- tracks_df %>%
filter(Status == "Breeding",
Lat >= 49, Lat <= 61, # UK latitude boundaries
Long >= -8, Long <= 2) # UK longitude boundaries
#filter on distance
tracks_df <- tracks_df %>%
group_by(ID) %>%
arrange(DT) %>%
mutate(
Prev_Long = lag(Long),
Prev_Lat = lag(Lat),
Distance = distHaversine(cbind(Prev_Long, Prev_Lat), cbind(Long, Lat)) / 1000, # Distance in kilometers
Long_Segment = Distance > 2000 # Identify long segments
) %>%
ungroup() %>% drop_na(Distance)
#flag
tracks_df <- tracks_df %>%
group_by(ID) %>%
mutate(
Remove_Point = lag(Long_Segment, default = FALSE) | Long_Segment | lead(Long_Segment, default = FALSE)
) %>%
ungroup()
tfx = tracks_df %>%
filter(!Remove_Point)
bto_tracks = tracks_df %>% filter(ID %in% uk_filtered$ID)
#label as spring or fall migration
bto_tracks = bto_tracks %>%
mutate(Status = case_when(
month(DT) %in% c(3,4,5) ~ "Spring",
month(DT) %in% c(7,8,9) ~ "Fall",
TRUE ~ NA))
btosf = st_as_sf(bto_tracks, coords = c("Long", "Lat"),
crs = 4326, agr = "constant")
bto_lines <- btosf %>%
group_by(ID,Status,Route,WolfID) %>%
filter(n() >= 2) %>% # Keep only groups with at least two records
summarize(geometry = st_union(geometry)) %>%
st_cast("LINESTRING") %>%
st_as_sf()
bto_mig = bto_lines %>% filter(!is.na(Status)) %>% filter(Route == 'West' | Route == 'East') %>% filter(!is.na(WolfID))
bbox <- st_bbox(btosf)
up = ggplot() +
geom_polygon(data = world, aes(x = long, y = lat, group = group), col='grey90', fill='white') +
#geom_sf(data=bto_tracks,col='grey20',alpha=0.2,size=0.5)+
geom_sf(data=bto_mig,aes(col=Route),alpha=0.5,size=0.15)+
xlab('') + ylab('') +
coord_sf(xlim = c(bbox["xmin"], bbox["xmax"]), ylim = c(bbox["ymin"], bbox["ymax"]), expand = FALSE)+
theme_classic() +
facet_wrap(Status~.)+
theme(panel.border = element_rect(colour = "black", fill = NA, size = 1),
panel.background = element_rect(fill = "aliceblue"),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.key.size = unit(0.5, 'cm'),
legend.position = 'top') +
annotation_scale(line_width = 0.5)
pdf('figures/UKSamples_Route_2023NOV27.pdf',height=6,width=12)
up
dev.off()
Plotting on a spherical globe is easy using this code.
#### Plot Track data on 3D Globe
setwd('~/merondun/cuculus_migration/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(sf)
library(tidyverse)
library(ggspatial)
library(RColorBrewer)
library(giscoR)
library(ggpubr)
# Read metadata
md <- read_tsv('Full_Metadata.txt') %>%
mutate(
Species = str_replace_all(Species, c('C. canorus' = 'Cuculus canorus', 'C. optatus' = 'Cuculus optatus')),
LatJit = jitter(Latitude, amount = 1),
LonJit = jitter(Longitude, amount = 1)
)
# Read spatial data, filter according to Maire's filtering
final.locations <- read_csv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/spatial/03-BoD-deaths-removed-no-NDVI.csv')
# Calculate distance between each point
final.locations <- final.locations %>%
group_by(tag.local.identifier) %>%
mutate(long.to.sf = location.long,
lat.to.sf = location.lat) %>%
st_as_sf(coords = c("long.to.sf", "lat.to.sf"), crs = 4326) %>%
arrange(tag.local.identifier, as.POSIXct(timestamp, format = "%Y-%m-%d")) %>%
mutate(dist = c(0,as.numeric(st_distance(geometry[-1], geometry[-n()], by_element = T))/1000)) %>%
st_drop_geometry()
# Remove paths over 3000km?
large.gap <- which(final.locations$dist > 3000) # Which paths are large?
final.locations$large.gap <- F # Identify previous point before large path as cut off
final.locations$large.gap[large.gap-1] <- T
# Create function to identify large gap
gap.function <- function(var){
x <- rle(var)
cumsum(x[[2]]) |> (\(.){ .[which(!x[[2]], T)] <- NA ; .})() |> rep(x[[1]])
}
# Create new ID when gap is over 3000
final.locations <- final.locations %>%
group_by(tag.local.identifier) %>%
mutate(new_grp = gap.function(large.gap)) %>%
fill(new_grp, .direction = "up") %>%
mutate(new_id = paste0(tag.local.identifier, new_grp))
# Plot paths
ggplot() +
geom_path(final.locations, mapping = aes(location.long, location.lat, group = new_id))
# Create back-up, switch to Justin workflow and rename sensible columns
m <- final.locations
names(m)[names(m) == 'individual.taxon.canonical.name'] <- 'Species'
names(m)[names(m) == 'new_id'] <- 'ID'
# Add R interpretable datestring, add 'Breeding/Overwintering
tracks_df <- m %>%
filter(!str_detect(Species, 'satura')) %>%
select(Species, timestamp, Route, Wolf, ID, location.long, location.lat) %>%
rename(
DT = timestamp,
Long = location.long,
Lat = location.lat
) %>%
mutate(
DT = mdy_hm(DT),
Year_Num = cumsum(month(DT) == 11 & lag(month(DT), default = 10) != 11),
Year_Label = paste0("Year", Year_Num),
Status = case_when(
month(DT) %in% c(12) ~ "Overwintering",
month(DT) %in% c(6) ~ "Breeding",
TRUE ~ NA_character_
)
)
# Assign CCW / CCE / COW / COE based on longitude during June and July
# routes <- tracks_df %>%
# filter(month(DT) == 6 | month(DT) == 7) %>%
# group_by(tag.local.identifier,Species) %>%
# summarize(Long = mean(Long), Lat = mean(Lat)) %>%
# mutate(Direction = ifelse( Species == 'Cuculus canorus' & Long > 110, 'CCE',
# ifelse( Species == 'Cuculus canorus','CCW',
# ifelse( Species == 'Cuculus optatus' & Long > 115,'COE','COW')))) %>%
# select(-c(Long,Lat))
# Assign CCW / CCE / CO - 3 pop
routes <- tracks_df %>%
filter(month(DT) == 6 | month(DT) == 7) %>%
group_by(tag.local.identifier,Species) %>%
summarize(Long = mean(Long), Lat = mean(Lat)) %>%
mutate(Direction = ifelse( Species == 'Cuculus canorus' & Long > 110, 'CCE',
ifelse( Species == 'Cuculus canorus','CCW', 'CO')))%>%
select(-c(Long,Lat))
# BUT, some indivduals dont' have track data in June July, manually inspect and assign - they are all western
tracks_routes <- left_join(tracks_df, routes) %>%
mutate(Direction = ifelse(!is.na(Direction),Direction,
ifelse(Species == 'Cuculus canorus','CCW','CO')))
# Confirm
tracks_routes %>%
ggplot() +
geom_path(mapping = aes(Long, Lat, group = ID, col = Direction)) +
facet_grid(Species~.,scales='free')
# Convert to sf
tracks = st_as_sf(tracks_routes, coords = c("Long", "Lat"),
crs = 4326, agr = "constant")
# Colors for can and opt
cols = brewer.pal(8,'Paired')[c(3,4,7,8)]
# CANORUS: projection string used for the polygons & ocean background
crs_string <- "+proj=ortho +lon_0=60 +lat_0=30"
# background for the globe - center buffered by earth radius
ocean <- st_point(x = c(0,0)) %>%
st_buffer(dist = 6371000) %>%
st_sfc(crs = crs_string)
# country polygons, cut to size
world <- gisco_countries %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) # reproject to ortho
# Reproject into view of africa / asia
reproj_can <- tracks %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) %>% # reproject to ortho
mutate(Long = st_coordinates(geometry)[,1],
Lat = st_coordinates(geometry)[,2]) %>%
filter(Species == 'Cuculus canorus') %>%
st_transform(crs = crs_string)
# now the action!
can_plot <- ggplot(data = world) +
geom_sf(data = ocean, fill = "cadetblue1", color = NA) + # background first
geom_sf(lwd = .1,col='grey80',fill='grey99') + # now land over the oceans
geom_path(reproj_can, mapping = aes(x = Long, y = Lat, group = ID, col = Direction),
alpha=0.5) + #and the cuckoos
scale_color_manual(values=cols[1:2])+
theme_void()
can_plot
# OPTATUS: projection string used for the polygons & ocean background
crs_string <- "+proj=ortho +lon_0=100 +lat_0=30"
# background for the globe - center buffered by earth radius
ocean <- st_point(x = c(0,0)) %>%
st_buffer(dist = 6371000) %>%
st_sfc(crs = crs_string)
# country polygons, cut to size
world <- gisco_countries %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) # reproject to ortho
# Reproject into view of africa / asia
reproj_opt <- tracks %>%
filter(Species == 'Cuculus optatus') %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) %>% # reproject to ortho
mutate(Long = st_coordinates(geometry)[,1],
Lat = st_coordinates(geometry)[,2]) %>%
st_transform(crs = crs_string)
# now the action!
opt_plot <- ggplot(data = world) +
geom_sf(data = ocean, fill = "cadetblue1", color = NA) + # background first
geom_sf(lwd = .1,col='grey80',fill='grey99') + # now land over the oceans
geom_path(reproj_opt, mapping = aes(x = Long, y = Lat, group = ID, col = Direction),
alpha=0.8) + # More opaque for optatus to help distinguish
scale_color_manual(values=cols[4])+ #and the colors
theme_void()
opt_plot
ggsave('figures/20241218_Spherical-Tracks-Filtered-CanOpt-3Pop.pdf',
ggarrange(can_plot,opt_plot),height=4,width=9,dpi=600)
saveRDS(tracks, "/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/spatial/tracks_data.rds")
#### Plot Track data on 3D Globe
setwd('~/merondun/cuculus_migration/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(qdrmapbox)
library(raster)
library(sf)
library(tidyverse)
library(ggspatial)
library(RColorBrewer)
library(giscoR)
library(ggpubr)
# Read metadata
md <- read_tsv('Full_Metadata.txt') %>%
mutate(
Species = str_replace_all(Species, c('C. canorus' = 'Cuculus canorus', 'C. optatus' = 'Cuculus optatus')),
LatJit = jitter(Latitude, amount = 1),
LonJit = jitter(Longitude, amount = 1)
)
# Read spatial data, filter according to Maire's filtering
final.locations <- read_csv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/spatial/03-BoD-deaths-removed-no-NDVI.csv')
# Calculate distance between each point
final.locations <- final.locations %>%
group_by(tag.local.identifier) %>%
mutate(long.to.sf = location.long,
lat.to.sf = location.lat) %>%
st_as_sf(coords = c("long.to.sf", "lat.to.sf"), crs = 4326) %>%
arrange(tag.local.identifier, as.POSIXct(timestamp, format = "%Y-%m-%d")) %>%
mutate(dist = c(0,as.numeric(st_distance(geometry[-1], geometry[-n()], by_element = T))/1000)) %>%
st_drop_geometry()
# Remove paths over 3000km?
large.gap <- which(final.locations$dist > 3000) # Which paths are large?
final.locations$large.gap <- F # Identify previous point before large path as cut off
final.locations$large.gap[large.gap-1] <- T
# Create function to identify large gap
gap.function <- function(var){
x <- rle(var)
cumsum(x[[2]]) |> (\(.){ .[which(!x[[2]], T)] <- NA ; .})() |> rep(x[[1]])
}
# Create new ID when gap is over 3000
final.locations <- final.locations %>%
group_by(tag.local.identifier) %>%
mutate(new_grp = gap.function(large.gap)) %>%
fill(new_grp, .direction = "up") %>%
mutate(new_id = paste0(tag.local.identifier, new_grp))
# Plot paths
ggplot() +
geom_path(final.locations, mapping = aes(location.long, location.lat, group = new_id))
# Create back-up, switch to Justin workflow and rename sensible columns
m <- final.locations
names(m)[names(m) == 'individual.taxon.canonical.name'] <- 'Species'
names(m)[names(m) == 'new_id'] <- 'ID'
# Add R interpretable datestring, add 'Breeding/Overwintering
tracks_df <- m %>%
filter(!str_detect(Species, 'satura')) %>%
select(Species, timestamp, Route, Wolf, ID, location.long, location.lat) %>%
rename(
DT = timestamp,
Long = location.long,
Lat = location.lat
) %>%
mutate(
DT = mdy_hm(DT),
Year_Num = cumsum(month(DT) == 11 & lag(month(DT), default = 10) != 11),
Year_Label = paste0("Year", Year_Num),
Status = case_when(
month(DT) %in% c(12) ~ "Overwintering",
month(DT) %in% c(6) ~ "Breeding",
TRUE ~ NA_character_
)
)
# Assign CCW / CCE / COW / COE based on longitude during June and July
# routes <- tracks_df %>%
# filter(month(DT) == 6 | month(DT) == 7) %>%
# group_by(tag.local.identifier,Species) %>%
# summarize(Long = mean(Long), Lat = mean(Lat)) %>%
# mutate(Direction = ifelse( Species == 'Cuculus canorus' & Long > 110, 'CCE',
# ifelse( Species == 'Cuculus canorus','CCW',
# ifelse( Species == 'Cuculus optatus' & Long > 115,'COE','COW')))) %>%
# select(-c(Long,Lat))
# Assign CCW / CCE / CO - 3 pop
routes <- tracks_df %>%
filter(month(DT) == 6 | month(DT) == 7) %>%
group_by(tag.local.identifier,Species) %>%
summarize(Long = mean(Long), Lat = mean(Lat)) %>%
mutate(Direction = ifelse( Species == 'Cuculus canorus' & Long > 110, 'CCE',
ifelse( Species == 'Cuculus canorus','CCW', 'CO')))%>%
select(-c(Long,Lat))
# BUT, some indivduals dont' have track data in June July, manually inspect and assign - they are all western
tracks_routes <- left_join(tracks_df, routes) %>%
mutate(Direction = ifelse(!is.na(Direction),Direction,
ifelse(Species == 'Cuculus canorus','CCW','CO')))
# Confirm
tracks_routes %>%
ggplot() +
geom_path(mapping = aes(Long, Lat, group = ID, col = Direction)) +
facet_grid(Species~.,scales='free')
# Convert to sf
tracks = st_as_sf(tracks_routes, coords = c("Long", "Lat"),
crs = 4326, agr = "constant")
# Colors for can and opt
cols = brewer.pal(8,'Paired')[c(3,4,7,8)]
# CANORUS: projection string used for the polygons & ocean background
crs_string <- "+proj=ortho +lon_0=60 +lat_0=30"
# background for the globe - center buffered by earth radius
ocean <- st_point(x = c(0,0)) %>%
st_buffer(dist = 6371000) %>%
st_sfc(crs = crs_string)
# country polygons, cut to size
world <- gisco_countries %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) # reproject to ortho
# Reproject into view of africa / asia
reproj_can <- tracks %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) %>% # reproject to ortho
mutate(Long = st_coordinates(geometry)[,1],
Lat = st_coordinates(geometry)[,2]) %>%
filter(Species == 'Cuculus canorus') %>%
st_transform(crs = crs_string)
# now the action!
can_plot <- ggplot(data = world) +
geom_sf(data = ocean, fill = "cadetblue1", color = NA) + # background first
geom_sf(lwd = .1,col='grey80',fill='grey99') + # now land over the oceans
geom_path(reproj_can, mapping = aes(x = Long, y = Lat, group = ID, col = Direction),
alpha=0.5) + #and the cuckoos
scale_color_manual(values=cols[1:2])+
theme_void()
can_plot
# OPTATUS: projection string used for the polygons & ocean background
crs_string <- "+proj=ortho +lon_0=100 +lat_0=30"
# background for the globe - center buffered by earth radius
ocean <- st_point(x = c(0,0)) %>%
st_buffer(dist = 6371000) %>%
st_sfc(crs = crs_string)
# country polygons, cut to size
world <- gisco_countries %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) # reproject to ortho
# Reproject into view of africa / asia
reproj_opt <- tracks %>%
filter(Species == 'Cuculus optatus') %>%
st_intersection(ocean %>% st_transform(4326)) %>% # select visible area only
st_transform(crs = crs_string) %>% # reproject to ortho
mutate(Long = st_coordinates(geometry)[,1],
Lat = st_coordinates(geometry)[,2]) %>%
st_transform(crs = crs_string)
# now the action!
opt_plot <- ggplot(data = world) +
geom_sf(data = ocean, fill = "cadetblue1", color = NA) + # background first
geom_sf(lwd = .1,col='grey80',fill='grey99') + # now land over the oceans
geom_path(reproj_opt, mapping = aes(x = Long, y = Lat, group = ID, col = Direction),
alpha=0.8) + # More opaque for optatus to help distinguish
scale_color_manual(values=cols[4])+ #and the colors
theme_void()
opt_plot
ggsave('figures/20241218_Spherical-Tracks-Filtered-CanOpt-3Pop.pdf',
ggarrange(can_plot,opt_plot),height=4,width=9,dpi=600)
saveRDS(tracks, "/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/spatial/tracks_data.rds")
### Satelltie
library(ggmap)
register_google(key = "AIzaSyDzjCgFf9hhFIBkVJ4tAyy6cjjq9BjmG4U")
# Canorus
can_flat <- tracks %>%
mutate(Long = st_coordinates(geometry)[,1],
Lat = st_coordinates(geometry)[,2]) %>%
filter(Species == 'Cuculus canorus')
can_bb <- st_bbox(can_flat)
names(can_bb) <- c('left','bottom','right','top')
can_bb_buffered <- can_bb + c(-buffer_sides, -buffer_tops, buffer_sides, buffer_tops)
map_data <- get_map(location = can_bb, zoom = 2, source = "google", maptype = 'satellite')
base_map <- ggmap(map_data)
base_map
# now the action!
can_plot <- base_map +
borders("world", col = "gray90", size = 0.05,alpha=0.5)+
geom_path(can_flat, mapping = aes(x = Long, y = Lat, group = ID, col = Direction),
alpha=0.5) + #and the cuckoos
scale_color_manual(values=cols[1:2])+
theme_void()
can_plot
# Optatus
opt_flat <- tracks %>%
mutate(Long = st_coordinates(geometry)[,1],
Lat = st_coordinates(geometry)[,2]) %>%
filter(Species == 'Cuculus optatus')
opt_bb <- st_bbox(opt_flat)
names(opt_bb) <- c('left','bottom','right','top')
map_data <- get_map(location = opt_bb, zoom = 2, source = "google", maptype = 'satellite')
base_map <- ggmap(map_data)
base_map
# now the action!
opt_plot <- base_map +
borders("world", col = "gray90", size = 0.05,alpha=0.5)+
geom_path(opt_flat, mapping = aes(x = Long, y = Lat, group = ID, col = Direction),
alpha=0.5,col=cols[4]) + #and the cuckoos
theme_void()
opt_plot
both <- ggarrange(can_plot,opt_plot)
ggsave('figures/20241218_Satellite_3Pop.pdf',dpi=300,both,height=6,width=9)
Plot syllable duration and pitch:
setwd('C:/Users/herit/Dropbox/CUCKOO_migration/Manuscript/Figures/Figure_1_TracksPopGenStructure/')
library(tidyverse)
library(RColorBrewer)
sounds <- read_csv('Cuckoos sounds/sounds.csv')
# Retain species
analyze <- sounds %>% filter(grepl('can|opt',Species))
cols <- c('#b2df8a','#33a02c','#fe9024')
pca_plot <- analyze %>%
ggplot() +
geom_point(aes(x=`Dur of syllable`,y=`Pitch of syll`,fill = Species,shape=Species),
size = 2, color = 'grey20', alpha = 0.9) +
theme_bw(base_size=8) +
scale_fill_manual(values=cols)+
scale_shape_manual(values=c(21,21,24))+
xlab(paste0('Duration of Syllable (s)'))+
ylab(paste0('Pitch of Syllable (Hz)'))
pca_plot
ggsave('Cuckoos sounds/20250212_Simple2Var-Sounds-CanOpt.pdf',units='in',dpi=300,height=2,width=3)
PCA:
setwd('C:/Users/herit/Dropbox/CUCKOO_migration/Manuscript/Figures/Figure_1_TracksPopGenStructure/')
library(tidyverse)
library(RColorBrewer)
scores <- read_csv('Cuckoos sounds/PCA_Scores.csv')
ids <- read_csv('Cuckoos sounds/sounds.csv')
vals <- read_csv('Cuckoos sounds/importance 2s.csv')
vt <- vals %>% filter(grepl('Proportion of',`...1`)) %>% select(-`...1`) %>% t() %>% as.data.frame
vt$Axis <- rownames(vt)
rownames(vt) <- NULL
p <- left_join(scores %>% dplyr::rename(Number = `...1`),ids)
cols <- c('#b2df8a','#33a02c','#fe9024')
pca_plot <- p %>%
ggplot() +
geom_point(aes(x=PC1,y=-PC2,fill = Species,shape=Species),
size = 2, color = 'grey20', alpha = 0.9) +
theme_bw(base_size=8) +
scale_fill_manual(values=cols)+
scale_shape_manual(values=c(21,21,24))+
xlab(paste0('PC1 (',round(vt[1,1],3)*100,'%)'))+
ylab(paste0('PC2 (',round(vt[2,1],3)*100,'%)'))
pca_plot
ggsave('Cuckoos sounds/20250115_PCA-Sounds-CanOpt.pdf',units='in',dpi=300,height=2,width=3)
setwd('C:/Users/herit/Dropbox/CUCKOO_migration/Manuscript/Figures/Figure_3_Hindcasting/Files for Fig 3/')
library(tidyverse)
library(readxl)
library(ggforce)
library(RColorBrewer)
# Load the Excel file, skipping the first row
df <- read_excel("BreedWinterOverlap3.xlsx",sheet = 'R_Input') %>% as_tibble()
df <- df %>% mutate(year_ka = year / 1000)
# Add temp periods (For demography! )
periods <- tibble(
xmin = c(0, 23750, 121500, 56750),
xend = c(10000, 33750, 131500, 66750),
col = c("hot", "cold", "hot", "cold"),
label = c("t0", "t0", "t1", "t1"),
y = c(0.005, -7.6, 1.94, -6.46)
) %>%
mutate(xmin = xmin / 1000,
xend = xend / 1000)
# Temperature
temp <- ggplot(df, aes(x = year_ka, y = Temp, color = Temp)) +
geom_line() +
scale_color_gradientn(colors = c("blue", "red")) +
geom_rect(data=periods,aes(xmin=xmin,xmax=xend,ymin=-Inf,ymax=Inf,fill=col),inherit.aes=FALSE,alpha=0.25)+
scale_fill_manual(values=c('cyan3','salmon2'))+
scale_x_reverse(limits=c(800,0),
breaks = seq(0, 800, by = 100),
labels = function(x) paste0(x, " Ka")) +
labs(y = "Temperature", x = "Year") +
facet_grid(1~.,scales='free')+
theme_bw(base_size=8) +
theme(legend.position='none')
temp
ggsave('20250206_Temperature.pdf',temp,height=0.65,width=6.5,dpi=600,units='in')
# Gap between breeding / overwintering
df_long <- df %>% dplyr::rename(East = east_diff, West = west_diff) %>%
pivot_longer(
cols = c(West, East),
names_to = "region",
values_to = "difference"
)
breeding <- ggplot(df_long, aes(x = year_ka, y = difference, group = region)) +
geom_link2(aes(colour = after_stat(ifelse(y > 0, "Overlap", "Gap"))),
linewidth = 0.5) +
scale_x_reverse(limits=c(800,0),
breaks = seq(0, 800, by = 100),
labels = function(x) paste0(x, " Ka")) +
geom_hline(yintercept = 0, lty = 2) +
geom_rect(data=periods,aes(xmin=xmin,xmax=xend,ymin=-Inf,ymax=Inf,fill=col),inherit.aes=FALSE,alpha=0.25)+
scale_fill_manual(values=c('cyan3','salmon2'))+
scale_color_manual(values=c('darkorchid','seagreen','darkblue','yellow'))+
labs(y = "Overlap (+) or Gap (-)", x = "Year") +
scale_y_continuous(limits = c(-40, 40)) +
#facet_grid(region ~ ., scales = "free") + #for separated east/west
theme_bw(base_size = 8)+
theme(legend.position='none')
breeding
df_long <- df_long %>%
mutate(color = case_when(
region == "East" & difference >= 0 ~ "A",
region == "West" & difference >= 0 ~ "B",
region == "East" & difference < 0 ~ "C",
region == "West" & difference < 0 ~ "D"
))
breeding <- ggplot(df_long, aes(x = year_ka, y = difference, group = region)) +
geom_link2(aes(colour = color), linewidth = 0.5) +
scale_x_reverse(limits = c(800, 0),
breaks = seq(0, 800, by = 100),
labels = function(x) paste0(x, " Ka")) +
geom_hline(yintercept = 0, lty = 2) +
geom_rect(data = periods, aes(xmin = xmin, xmax = xend, ymin = -Inf, ymax = Inf, fill = col), inherit.aes = FALSE, alpha = 0.25) +
scale_fill_manual(values = c('cyan3', 'salmon2')) +
scale_color_manual(values = c('darkblue', 'blue', 'darkgreen', 'lightgreen')) +
labs(y = "Overlap (+) or Gap (-)", x = "Year") +
scale_y_continuous(limits = c(-40, 40)) +
theme_bw(base_size = 8) +
theme(legend.position = 'none')
breeding
# The line itself is difficult to vectorize because it is thousands of segments, export once as png, and then hash out the link() lines and resave
ggsave('20250206_BreedingOverlap.png',breeding,height=0.95,width=6.5,dpi=600,units='in')
ggsave('20250206_BreedingOverlapAxes.pdf',breeding,height=0.95,width=6.5,dpi=600,units='in')
# Gaps
df_tracks <- df %>%
pivot_longer(c(EWgap, Sahara0Gap,IndianOcean))
df_tracks$name <- factor(df_tracks$name,levels=c('EWgap','Sahara0Gap','IndianOcean'))
tracks <- df_tracks %>%
ggplot(aes(x = year_ka, y = value,col=name)) +
geom_line(linewidth = 0.75) +
geom_hline(yintercept=0.5,lty=2,col='grey30')+
scale_x_reverse(limits=c(800,0),
breaks = seq(0, 800, by = 100),
labels = function(x) paste0(x, " Ka")) +
geom_rect(data=periods,aes(xmin=xmin,xmax=xend,ymin=-Inf,ymax=Inf,fill=col),inherit.aes=FALSE,alpha=0.25)+
scale_fill_manual(values=c('cyan3','salmon2'))+
labs(y = "Temperature", x = "Year")+
scale_color_manual(values=brewer.pal(3,'Set2'))+
#facet_grid(name~.,scales='free')+
theme_bw(base_size=8) +
coord_cartesian(ylim=c(0,1))
theme(legend.position='none')
tracks
ggsave('20250206_Gaps.pdf',tracks,height=0.95,width=6.5,dpi=600,units='in')
Identify warm and cold periods:
.libPaths('~/mambaforge/envs/R/lib/R/library')
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/unrelated_chyiyin/crosscoal/output')
library(tidyverse)
#import ice core data
icecore = read_tsv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/Ice_Temperature_Reconstructions_Kawamura_NOAA-6076.txt')
icecore = icecore %>% dplyr::rename(time = TopAge)
# Hot times, recent
hot_0 <- icecore %>% filter(time < 5e4) %>% slice_max(deltaT,n=20) %>%
summarize(time_low = 0, time_high = max(time), temp = mean(deltaT), duration = time_high - time_low)
# Cold times, recent
cold_0 <- icecore %>% filter(time < 5e4) %>% slice_min(deltaT,n=20) %>%
summarize(time_low = min(time), time_high = max(time), temp = mean(deltaT), duration = time_high - time_low)
# Hot times, mid
hot_1 <- icecore %>% filter(time < 1.75e5 & time > 5e4) %>% slice_max(deltaT,n=20) %>%
summarize(time_low = min(time), time_high = max(time), temp = mean(deltaT), duration = time_high - time_low)
# Cold times, mid
cold_1 <- icecore %>% filter(time < 1e5 & time > 5e4) %>% slice_min(deltaT,n=20) %>%
summarize(time_low = min(time), time_high = max(time), temp = mean(deltaT), duration = time_high - time_low)
periods <- rbind(hot_0, cold_0, hot_1, cold_1) %>%
mutate(stage = c('hot','cold','hot','cold'),
period = c('t0','t0','t1','t1'))
periods
# Adjusting durations to be exactly 10000
adjusted_periods <- periods %>%
mutate(
mid_point = (time_low + time_high) / 2, #Calculate midpoint
time_low = if_else(duration > 10000, mid_point - 5000, time_low),
time_high = if_else(duration > 10000, mid_point + 5000, time_high),
duration = 10000 # Set duration to 10000
) %>%
select(-mid_point) # Remove the mid_point column
# Output the adjusted dataframe
adjusted_periods
#Modify recent time so that it occurs from contemporary time
adjusted_contemp <- adjusted_periods %>% mutate(
time_low = ifelse(stage == 'hot' & period == 't0',0,time_low),
time_high = ifelse(stage == 'hot' & period == 't0',10000,time_high)) %>%
select(-temp) %>% #recalculate temps within this 10KY
rowwise() %>%
mutate(
temp = mean(icecore$deltaT[icecore$time >= time_low & icecore$time <= time_high], na.rm = TRUE)
)
adjusted_contemp
# Kawamura NOAA 6076 Ice core inferred temperature
ice_plot <- icecore %>%
ggplot(aes(x = time, y = deltaT)) +
geom_line()+
coord_cartesian(xlim=c(0,1.5e5))+
geom_rect(data=adjusted_contemp,aes(xmin=time_low,xmax=time_high,ymin=-Inf,ymax=Inf,fill=stage),
alpha=0.5,inherit.aes=FALSE)+
scale_fill_manual(values=c('cyan3','salmon2'))+
geom_label(data=adjusted_contemp,size=1.5,aes(x=time_low+5500,y=4,
label=paste0(stage,': ',period,' ',round(temp,2),'°C\n',time_low/1000,' - ',time_high/1000,'Ka')),col='black')+
xlab('Time')+ylab('Delta T (°C)')+
theme_test(base_size=10)
ice_plot
png('~/merondun/cuculus_migration/figures/WarmCold_Periods.png',units='in',res=300,height=2,width=3.5)
pdf('~/merondun/cuculus_migration/figures/WarmCold_Periods.pdf',height=2,width=3.5)
ice_plot
dev.off()
| Start | End | Duration | Stage | Period | Mean Temperature |
|---|---|---|---|---|---|
| 0 | 10000 | 10000 | hot | t0 | 0.005 |
| 23750 | 33750 | 10000 | cold | t0 | -7.6 |
| 121500 | 131500 | 10000 | hot | t1 | 1.94 |
| 56750 | 66750 | 10000 | cold | t1 | -6.46 |
Still on the 658 individual SRR accessions. Submit positional library.
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=10
#SBATCH --mem-per-cpu=4763mb
#SBATCH --time=12:00:00
SCRATCH=tmp/$SLURM_JOB_ID
wd=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/trimmed_fastq
qcdata=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/trimmed_fastq
outdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/bams/2021_04
genome=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
RUN=$1
bwa index ${genome}
#Mapping script, will map reads with the prefix identified in the list ${lists}/${RUN}
ID="$(cat ${wd}/RGs/${RUN}.ID | tail -1)";
PU="$(cat ${wd}/RGs/${RUN}.PU | tail -1)";
SM="$(cat ${wd}/RGs/${RUN}.SM | tail -1)";
LB="$(cat ${wd}/RGs/${RUN}.LB | tail -1)";
#library mapping
bwa mem -M -p -t 10 -R "@RG\tID:${ID}\tSM:${SM}\tPL:ILLUMINA\tPU:${PU}\tLB:${LB}" ${genome} ${qcdata}/${RUN}.trim.fastq.gz | samtools sort -@10 -o ${SCRATCH}/${RUN}.bam -;
samtools view -b -f 1 -F 524 ${SCRATCH}/${RUN}.bam > ${outdir}/${RUN}.bam
samtools index -b ${outdir}/${RUN}.bam;
And submit by sample:
for sample in $(cat Libraries.list); do sbatch -J ${sample} Align.sh ${sample}; done
Plot Alignment:
setwd('F:/Research/scratch/cuckoo/2022_04/QC/alignments/')
library(ggplot2)
library(tidyverse)
library(dplyr)
library(ggsci)
library(ggpubr)
library(viridis)
library(openxlsx)
#import alignments
aln <- read.table('Alignments.txt',header=T)
aln <- aln %>% rename(ID = SAMPLE)
coord <- read.xlsx('D:/Sync/JM/Research/Cuckoo/Cuculus_Metadata_n404.xlsx')
alnm <- merge(aln,coord,by='ID')
raw <- read.table('reads.txt',header=T)
raw <- raw %>% group_by(ID) %>% summarize(total=sum(Reads))
ra <- merge(alnm,raw,by='ID')
ra$aligned <- ra$TOTAL_READS/1000000
ra$rate <- ra$aligned/ra$total
cs1 <- subset(ra,Species != 'EC' & Sex != 'U' & Species != 'HH')
cs1 <- cs1 %>% mutate(Species = gsub('CC','C. canorus canorus',Species),
Species = gsub('CB','C. canorus bakeri',Species),
Species = gsub('CO','C. optatus',Species),
Species = gsub('CP','C. poliocephalus',Species),
Species = gsub('CM','C. micropterus',Species),
Species = gsub('CS','C. saturatus',Species))
cs1$Species <- factor(cs1$Species,levels=c('C. canorus canorus','C. canorus bakeri','C. optatus','C. saturatus','C. micropterus','C. poliocephalus'))
ccol <- coord %>% select(DistanceClade,DistanceColor) %>% unique()
cp <- cs1 %>% ggplot(aes(x=Structure_Order,y=rate,col=DistanceClade,shape=Species))+
geom_point()+
scale_color_manual(values=col$DistanceColor,
breaks=col$DistanceClade)+
scale_shape_manual(values=c('C. canorus canorus'=16,'C. canorus bakeri'=15,'C. optatus'=17,'C. saturatus'=6,'C. micropterus'=3,'C. poliocephalus'=4))+
facet_grid(.~Sex,scale='free')+
ylab('Mean Coverage')+xlab('Sample (Same Order as Structure Plots)')+
theme_minimal()+
labs(color='K-Means Distance Clade') +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1))
cp
cs1 %>% group_by(Species) %>% summarize(mean=mean(rate))
png('Alignments_Cuckoo_All.png',height=7,width=10,res=300,bg='white',units='in')
cp
dev.off()
This script will merge, mark duplicates, remove reads overhanging scaffold ends, and summarize (coverage + alignments). Submit positional SAMPLE (n=390).
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=3
#SBATCH --mem-per-cpu=4763mb
#SBATCH --time=12:00:00
SCRATCH=tmp/$SLURM_JOB_ID
#Bam cleaning script. Merge by sample, Mark duplicates, remove reads overhanging scaffolds, and summarize
RUN=$1
RGdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/trimmed_fastq/RGs
bamdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/bams/2021_04
genome=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
mkdir tmp
mkdir $SCRATCH
#Make final bam and stats folder
mkdir ${bamdir}/merged
mkdir ${bamdir}/merged/stats
mkdir ${bamdir}/merged/stats/alignments
mkdir ${bamdir}/merged/stats/coverage
#merge all the libraries together
samtools merge - ${bamdir}/${RUN}*.bam | samtools sort -@3 -o $SCRATCH/${RUN}.bam
samtools index $SCRATCH/${RUN}.bam
sambamba markdup --tmpdir $SCRATCH/ $SCRATCH/${RUN}.bam $SCRATCH/${RUN}.dup.bam
samtools index $SCRATCH/${RUN}.dup.bam
#Remove reads over scaffold end
gatk CleanSam -I $SCRATCH/${RUN}.dup.bam -O ${bamdir}/merged/${RUN}.bam -R ${genome} --CREATE_INDEX true
Summary Stats
gatk CollectAlignmentSummaryMetrics -R ${genome} -I ${bamdir}/merged/${RUN}.bam -O ${bamdir}/merged/stats/alignments/${RUN}.alignments.txt -LEVEL SAMPLE -LEVEL READ_GROUP
#calculate coverage
mosdepth --threads 3 --use-median --by 100000 --fast-mode --no-per-base ${bamdir}/merged/stats/coverage/${RUN} ${bamdir}/merged/${RUN}.bam
Sample Stats
Alignments
Check out alignment stats within /stats/alignments/
mkdir output
grep 'TOTAL' 001_CB_ORW_CHN_F.alignments.txt > output/Alignments.txt
for i in $(ls *.alignments.txt | sed 's/\..*//g'); do
awk 'NR==13' ${i}.alignments.txt >> output/Alignments.txt
done
Coverage
Within /coverage/
#take the unzipped bed.gz file from mosdepth, and change it into the below
for i in $(ls *.regions.bed | rev | cut -c13- | rev); do awk -v s=${i} 'BEGIN{print s}; {print $4}' ${i}.regions.bed > ${i}.cov; done
#create header file with chr / start / end, since all files on the same genome, they all have the same length and this is consistent across samples
awk '{OFS="\t"}BEGIN{print "CHR","START","END"};{print $1, $2, $3}' 053_CC_RED_FIN_F.regions.bed | sed 's/\ /\t/g' > 0001Header.cov
#combine them all together..
paste *.cov > Master.coverage
Also print the GW mean of median:
for i in $(ls *.regions.bed | rev | cut -c13- | rev); do awk -v s=${i} '{ total += $4 } END { print s, total/NR }' ${i}.regions.bed >> gw_median.coverage ; done
And by chromosome:
for i in $(ls *.summary.txt | sed 's/\..*//g'); do
grep -i -w 'CM030676.1\|CM030677.1\|total\|CM030679.1' ${i}.mosdepth.summary.txt | awk -v s=${i} 'BEGIN{OFS="\t"}{print s, $1, $4}' >> summarize/Compartment.coverage.txt ; done
Split Genome
genome=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
#create dict
gatk CreateSequenceDictionary -R $genome
#split genome by Ns
gatk ScatterIntervalsByNs -R $genome -O scatter.interval_list -OT ACGT
#split intervals
gatk SplitIntervals -R $genome -L scatter.interval_list --scatter-count 100 -O interval_files/
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=5
#SBATCH --time=240:00:00
vcfdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/2022_11/raw_vcf
genome=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
ints=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/2022_11/snpcalling/intervals
CHR=$1
GROUP=$2
mkdir $vcfdir
if [[ $CHR == 0102 ]]
then
maxdp=20000
else
maxdp=100
fi
echo "WORKING ON INTERVAL ${CHR} WITH MAX DEPTH ${maxdp}"
#bcftools 1.16
bcftools mpileup --max-depth ${maxdp} -C 50 -threads 5 -f ${genome} -b ${GROUP}.bam -R $ints/$CHR.bed -a "AD,ADF,ADR,DP,SP" | \
bcftools call --ploidy 2 --threads 5 -m -Oz -o ${vcfdir}/${CHR}_bcft.vcf.gz
Merge by chromosome
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_highmem
#SBATCH --cpus-per-task=5
#SBATCH --time=200:00:00
genome=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
mkdir ../merged
CHR=$1
echo "${CHR}"
mask=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/${CHR}.consmask
bcftools concat --file-list VCFs.list --threads 5 -a -r ${CHR} -Ou | \
bcftools sort -Oz -o ../merged/${CHR}.vcf.gz -
bcftools index --threads 5 ../merged/${CHR}.vcf.gz
Assign samples within 500km of one another as a potential distance group, we will only examine relatedness within these groups.
#### Determine relatedness with PHI statistic
setwd('~/merondun/cuculus_migration/relatedness/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
#Igraph approach
library(tidyverse)
library(RColorBrewer)
library(geosphere)
library(igraph)
library(spThin)
library(sf)
library(ggspatial)
library(factoextra)
library(ggpubr)
library(viridis)
# Read metadata and filter for necessary columns
md = read_tsv('../Full_Metadata.txt')
species = c('CC','CO')
world <- map_data("world")
set.seed(9999)
#Find samples within specified distance
assign_clusters_based_on_distance <- function(ds) {
#Calculate pairwise distances
distances <- geosphere::distm(ds[, c("Longitude", "Latitude")])
# Create an adjacency matrix where 1 represents distances within 500km
adjacency_matrix <- distances <= 500000
# Create a graph frm the adjacency matrix
g <- graph_from_adjacency_matrix(adjacency_matrix, mode = "undirected", diag = FALSE)
# Find connected clusters
clusters <- components(g)$membership
return(clusters)
}
#apply to each species
ds_clustered <- md %>%
filter(grepl('CC|CO',SpeciesShort)) %>%
group_by(SpeciesShort) %>%
mutate(cluster = assign_clusters_based_on_distance(cur_data())) %>%
ungroup()
dist_jit = ds_clustered %>% mutate(Cluster = paste0(SpeciesShort,'_',cluster))
colshape = dist_jit %>% ungroup %>% select(Cluster) %>% unique %>%
mutate(ClustCol = rep_len(c(viridis(12, option = 'turbo'),brewer.pal(12, 'Paired')), 35), ClustShape = rep_len(c(21, 24, 25, 22,23), 35))
dist_site = st_as_sf(dist_jit, coords = c("Longitude", "Latitude"), crs = 4326, agr = "constant")
distp = ggplot() +
geom_polygon(data = world, aes(x = long, y = lat, group = group),col='grey50',fill='grey95') +
geom_sf(data = dist_site,
aes(fill=Cluster,shape=Cluster),
size=3,show.legend = T) +
xlab('Longitude')+ylab('Latitude')+
scale_fill_manual(values=colshape$ClustCol,breaks=colshape$Cluster)+
scale_shape_manual(values=colshape$ClustShape,breaks=colshape$Cluster)+
coord_sf(xlim = c(min(ds_clustered$Longitude)-5, max(ds_clustered$Longitude)+5),
ylim = c(min(ds_clustered$Latitude)-5, max(ds_clustered$Latitude)+5), expand = FALSE)+
theme_classic()+
facet_grid(SpeciesShort ~ .)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"),
legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))+
annotation_scale(line_width=0.5)
distp
pdf('~/merondun/cuculus_migration/relatedness/DistanceClusters_2024FEB28.pdf',height=6,width=7)
distp
dev.off()
write.table(dist_jit %>% ungroup %>% select(ID,Cluster),'~/merondun/cuculus_migration/relatedness/DistanceClusters_2024FEB28.txt',quote=F,sep='\t',row.names=F)
Filter at 4 levels. Have this python script available:
import sys
import pysam
# Take inputs from command line arguments
outgroup_samples_file = sys.argv[1]
input_vcf_file = sys.argv[2]
output_prefix = sys.argv[3]
# Load outgroup samples from the input file
with open(outgroup_samples_file, 'r') as file:
outgroup_samples = [line.strip() for line in file]
# Open the input VCF file
with pysam.VariantFile(input_vcf_file) as vcf_in:
# Add 'AA' to the INFO fields of the header
vcf_in.header.info.add('AA', '1', 'String', 'Ancestral Allele')
# Create an output file
with pysam.VariantFile(f"{output_prefix}.vcf.gz", 'w', header=vcf_in.header) as vcf_out:
# Iterate over all records (SNPs)
for record in vcf_in:
alleles = []
for sample in outgroup_samples:
genotype = record.samples[sample]['GT']
if genotype.count(None) > 1 or set(genotype) == {0, 1}:
# If the genotype is missing or heterozygous, mark this record as unassignable and break
alleles = []
break
allele = record.alleles[genotype[0]]
alleles.append(allele)
if len(set(alleles)) > 1:
# If more than one allele is found among the outgroup samples, assign AA=U
ancestral_allele = 'U'
elif len(alleles) == 0:
# If no alleles could be determined from the outgroup samples, assign AA=U
ancestral_allele = 'U'
else:
# Otherwise, the ancestral allele is the one found in the outgroup samples
ancestral_allele = alleles[0]
# Update the AA info
record.info['AA'] = ancestral_allele
vcf_out.write(record)
Script for full filtering:
It also creates simon martin’s invariant sites geno file, using MM1 filters on invariant sites.
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=10
#SBATCH --time=200:00:00
# mamba activate snps
# submit as for i in $(cat Chromosomes.list); do sbatch -J FILTER_${i} ~/merondun/cuculus_migration/snp_calling/4.SNP_Filtering.sh ${i}; done
CHR=$1
mkdir chromosome_vcfs chromosome_vcfs/stats
raw_vcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/merged
chr_map=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/momi2/chr_map.txt
#this bed contains GOOD regions which are neutral for demography
neutral=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/extracting_neutral_sites/neutral-sites_cuckoo__intergenic-intron-4fold.bed
#Subset samples
bcftools view --threads 10 --samples-file ~/merondun/cuculus_migration/relatedness/Sample_List_Unrelated.list --force-samples -Ou ${raw_vcfs}/${CHR}.vcf.gz | \
#change chr_1 to 1
bcftools annotate --threads 10 --rename-chrs ${chr_map} -Ou | \
bcftools norm -d both -Oz -o chromosome_vcfs/${CHR}_raw.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_raw.vcf.gz
#filter on DP, retain only sites within mean coverage + 2*sd or mean covearge - 2*sd
bcftools query -f '%CHROM\t%POS\t%INFO/DP\n' chromosome_vcfs/${CHR}_raw.vcf.gz > chromosome_vcfs/${CHR}_raw.dp_stats.txt
mean=$(cat chromosome_vcfs/${CHR}_raw.dp_stats.txt | datamash mean 3 | xargs printf "%.0f\n")
sd=$(cat chromosome_vcfs/${CHR}_raw.dp_stats.txt | datamash sstdev 3 | xargs printf "%.0f\n")
low=$(echo "$mean - 2*$sd" | bc)
high=$(echo "$mean + 2*$sd" | bc)
rm chromosome_vcfs/${CHR}_raw.dp_stats.txt
#subset SNPs, filter for MQ ONLY, fix chromosome name (stripping chr_), so new chromosome is simply 1
bcftools view --types snps --min-alleles 2 --max-alleles 2 --min-ac 1 --max-af 0.9999 --threads 10 -Oz -o chromosome_vcfs/${CHR}_snp.MQ.vcf.gz -i "QUAL > 20 & MQ > 30 & INFO/DP > ${low} & INFO/DP < ${high}" chromosome_vcfs/${CHR}_raw.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_snp.MQ.vcf.gz
#filter for >= 5x genotypes, apply 10% missingness filter
MINDP=5
bcftools +setGT -Ou chromosome_vcfs/${CHR}_snp.MQ.vcf.gz -- -t q -i "FMT/DP < ${MINDP}" -n "./." | \
#update AC fields after changing genotypes to missing
bcftools +fill-tags -Ou -- -t AC,AN | \
bcftools view --threads 10 --types snps --min-alleles 2 --max-alleles 2 --min-ac 1 --max-af 0.9999 -i 'F_MISSING < 0.1' -Oz -o chromosome_vcfs/${CHR}_snp.MQ-5X-MM1.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_snp.MQ-5X-MM1.vcf.gz
# Polarize alleles based on C. poliocephalus / C. micropterus shared derived alleles
python ~/merondun/gen_gen/general/Polarize_VCF_Add_AA.py ~/merondun/cuculus_migration/snp_calling/Outgroups.list chromosome_vcfs/${CHR}_snp.MQ-5X-MM1.vcf.gz chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A1
# Switch reference and ancestral allele, ensure --recalc is added to switch AC fields
java -jar ~/modules/jvarkit/dist/vcffilterjdk.jar -f ~/modules/jvarkit/dist/script.js --recalc chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A1.vcf.gz --output chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A2.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A2.vcf.gz
# Remove any alleles which aren't polarizable, and remove genotype fields which weren't switched (e.g. AD) because of ref allele switch
bcftools view --min-alleles 2 --max-alleles 2 --types snps --min-ac 1 --max-af 0.999 --threads 10 -i 'INFO/AA!="U"' chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A2.vcf.gz -Ob -o - | \
bcftools annotate -x ^FORMAT/GT,FORMAT/DP,FORMAT/GQ -o chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA.vcf.gz
rm chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A1.vcf.gz* chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-A2.vcf.gz*
# LD Prune
bedtools intersect -header -a chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA.vcf.gz -b $neutral | \
bcftools +prune -m 0.2 --window 50 -Oz -o chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA-LDr2w50.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA-LDr2w50.vcf.gz
# And phase with beagle
java -Xmx40g -jar ~/modules/beagle.28Jun21.220.jar gt=chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA-LDr2w50.vcf.gz out=chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA-LDr2w50-phased nthreads=10 impute=true
bcftools index --threads 10 chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA-LDr2w50-phased.vcf.gz
# Grab invariant sites, filter for MQ, DP (same thresholds), and missingness
echo "FILTERING AND MERGING INVARIANT SITES FOR ${CHR}"
bcftools view --max-ac 0 -i "MQ > 30 & INFO/DP > ${low} & INFO/DP < ${high} & F_MISSING < 0.1" --threads 10 -Ou chromosome_vcfs/${CHR}_raw.vcf.gz -Oz -o chromosome_vcfs/${CHR}_1N.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}_1N.vcf.gz
# re-merge the invariant and filtered SNPs
bcftools concat --threads 10 -Ob chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA.vcf.gz chromosome_vcfs/${CHR}_1N.vcf.gz | \
bcftools sort -Oz -o chromosome_vcfs/${CHR}.MQ-5X-MM1-AA.AllSites.vcf.gz
bcftools index --threads 10 chromosome_vcfs/${CHR}.MQ-5X-MM1-AA.AllSites.vcf.gz
#SNP Counts in tidy format
raw=$(bcftools index -n chromosome_vcfs/${CHR}_raw.vcf.gz)
mq=$(bcftools index -n chromosome_vcfs/${CHR}_snp.MQ.vcf.gz)
mm1=$(bcftools index -n chromosome_vcfs/${CHR}_snp.MQ-5X-MM1.vcf.gz)
aa=$(bcftools index -n chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA.vcf.gz)
ld=$(bcftools index -n chromosome_vcfs/${CHR}_snp.MQ-5X-MM1-AA-LDr2w50.vcf.gz)
total=$(bcftools index -n chromosome_vcfs/${CHR}.MQ-5X-MM1-AA.AllSites.vcf.gz)
echo -e "${CHR}\tRaw\t${raw}" > chromosome_vcfs/stats/${CHR}.snp.counts
echo -e "${CHR}\tMQ20\t${mq}" >> chromosome_vcfs/stats/${CHR}.snp.counts
echo -e "${CHR}\t5X_MM1\t${mm1}" >> chromosome_vcfs/stats/${CHR}.snp.counts
echo -e "${CHR}\tPolarized\t${aa}" >> chromosome_vcfs/stats/${CHR}.snp.counts
echo -e "${CHR}\tLD_R2_W50\t${ld}" >> chromosome_vcfs/stats/${CHR}.snp.counts
echo -e "${CHR}\tAllSites\t${total}" >> chromosome_vcfs/stats/${CHR}.snp.counts
#create simon's divergence input file
echo "CREATE SIMON MARTIN DIVERGENCE OUTPUT "
python ~/modules/genomics_general/VCF_processing/parseVCF.py --ploidyMismatchToMissing --ploidy 2 --skipIndels -i chromosome_vcfs/${CHR}.MQ-5X-MM1-AA.AllSites.vcf.gz | \
bgzip > chromosome_vcfs/${CHR}.geno.gz
rm chromosome_vcfs/${CHR}_1N.vcf.gz*
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=10
#SBATCH --time=200:00:00
# mamba activate snps
# submit as for i in $(cat Chromosomes.list); do sbatch -J FILTER_${i} ~/merondun/cuculus_migration/snp_calling/7.Subsample_Demography.sh ${i}; done
CHR=$1
#filtered vcfs directory, polarized alleles
vcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/chromosome_vcfs
outvcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/subsampled_n40_vcfs
mkdir $outvcfs
#this bed contains GOOD regions which are neutral for demography
neutral=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/extracting_neutral_sites/neutral-sites_cuckoo__intergenic-intron-4fold.bed
#this bed contains BAD sites which are repeats (bed does not include 'low_complexity' and 'simple_repeats', so it's largely TEs)
repeats=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats-ExcludingLowComplexity-SimpleRepeats.bed
#Subset samples
bcftools view --threads 10 --samples-file ~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024MAR13.list -Ov ${vcfs}/${CHR}.MQ-5X-MM1-AA.AllSites.vcf.gz | \
#RETAIN neutral sites
bedtools intersect -header -a - -b $neutral | \
#EXCLUDE repeats
bedtools subtract -header -a - -b $repeats | \
#re-apply 10% missing data threshold
bcftools view --threads 10 -i 'F_MISSING < 0.1' -Oz -o ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.vcf.gz
bcftools index --threads 10 ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.vcf.gz
#Phase VCF with beagle
java -jar -Xmx160g ~/modules/beagle.28Jun21.220.jar gt=${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.vcf.gz out=${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.tmp nthreads=8 window=40 overlap=2 impute=true
bcftools index --threads 10 ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.tmp.vcf.gz
#add the INFO DP,MQ and FMT/DP annotations back onto this VCF, from the pre-phased VCF
bcftools annotate --threads 10 -a ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.vcf.gz -c INFO/DP,INFO/MQ,FMT/DP ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.tmp.vcf.gz -Oz -o ${vcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.vcf.gz
bcftools index --threads 10 ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.vcf.gz
rm ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.tmp.vcf.gz ${outvcfs}/${CHR}.MQ-5X-MM1-AA.AllSites-Neutral-NonRepetitive.Phased.tmp.vcf.gz.csi
I will also prune for LD and do a PCA / ADMIXTURE on sample subsets:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=20
#SBATCH --time=48:00:00
# submit as: for GROUP in $(cat GROUPS.list); do sbatch -J MERGE_${i} ~/merondun/cuculus_migration/admixture/1.Subset_Subgroups.sh ${GROUP}; done
# mamba activate snps
GROUP=$1
mkdir autosomal_files
#intergenic and 4-fold degenerate sites
neutral=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/extracting_neutral_sites/neutral-sites_cuckoo__intergenic-intron-4fold.bed
# Merge the VCFs into a single autosomal SNP set
bcftools concat --threads 20 -Ou chromosome_vcfs/*AA.vcf.gz | \
#subset samples
bcftools view --threads 20 -Ou --force-samples --samples-file ~/merondun/cuculus_migration/relatedness/${GROUP}_Unrelated.list | \
#ensure only variant SNPs are left
bcftools view --threads 20 -Ov --min-alleles 2 --max-alleles 2 --min-af 0.05 --max-af 0.999 -i 'F_MISSING < 0.1' | \
#intersect with neutral sites
bedtools intersect -header -a - -b $neutral | \
#prune LD
bcftools +prune -m 0.2 --window 50 -Oz -o autosomal_files/${GROUP}.MQ-5X-MM1-AA-LDr2w50.vcf.gz
bcftools index --threads 20 autosomal_files/${GROUP}.MQ-5X-MM1-AA-LDr2w50.vcf.gz
#Create plink bed files for admixture + perform pca
~/modules/plink2 --threads 20 --vcf autosomal_files/${GROUP}.MQ-5X-MM1-AA-LDr2w50.vcf.gz --chr-set 29 --allow-extra-chr --set-missing-var-ids @:# \
--make-bed --recode vcf bgz --pca --out autosomal_files/${GROUP}.MQ-5X-MM1-AA-LDr2w50
Create PCA and run ADMIXTURE on the full dataset:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=10
#SBATCH --time=200:00:00
# mamba activate snps
# submit as: for GROUP in $(cat GROUPS.list); do for K in {2..10}; do sbatch -J ADMIX_${GROUP}_${K} ~/merondun/cuculus_migration/admixture/2.ADMIXTURE.sh ${GROUP} ${K}; done ; done
GROUP=$1
K=$2
mkdir -p admixture/unrelated_n261/${GROUP}
cd admixture/unrelated_n261/${GROUP}
#Run Admixture
admixture -j7 --cv=5 ../../autosomal_files/${GROUP}.MQ-5X-MM1-AA-LDr2w50.bed ${K} > ${GROUP}.MQ-5X-MM1-AA-LDr2w50.log${K}.out
#Evaluate Runs
~/modules/evalAdmix/evalAdmix -plink ../../autosomal_files/${GROUP}.MQ-5X-MM1-AA-LDr2w50 -fname ${GROUP}.MQ-5X-MM1-AA-LDr2w50.${K}.P -qname ${GROUP}.MQ-5X-MM1-AA-LDr2w50.${K}.Q -P 10 -o eval_${GROUP}.MQ-5X-MM1-AA-LDr2w50_${K}
Grab 5K gene trees with 6670 OTUs using the ‘Sequenced Species’ dataset of Hackett from BirdTree
Includes 126 of the Cuculiformes species:
Stage 2 MayrParSho Hackett: 10K trees with 6670 OTUs [10k sampled]
Job ID: tree-pruner-3972b4dc-d4e3-42db-8473-30777033c335
Estimate species tree from the hackett trees:
# First estimate species tree
java -jar ~/modules/ASTRAL/astral.5.7.7.jar -i stage2_mayrparshohackett_10k.tre -o stage2_mayrparshoahackett_10k_species_tree.tre
# Estimate concordance factors using the species tree against the 10K hackett trees
iqtree -t stage2_mayrparshohackett_10k_species_tree.tre --gcf stage2_mayrparshohackett_10k.tre --prefix stage2_mayrparshohackett_10k_concord
| Route | Nonbreeding | Description |
|---|---|---|
| S | I | Sedentary within Indo-Malay / Australia |
| S | SA | Sedentary within the Americas |
| S | A | Sedentary within Africa |
| II | I | Migration within Indo-Malay / Australia |
| SS | SA | Migration within South America |
| AA | A | Migration within Africa |
| APw | A | Africa - Palearctic migration, western route |
| APe | A | Africa - Palearctic migration, eastern route |
| APwe | A | Africa - Palearctic migration, polymorphic route |
| NS | SA | North - South America migration |
Using that concordance tree, estimate ancestral state changes:
#### Plot & estimate species tree
setwd('~/EvoBioWolf/CUCKOO_migration/reconstruction/')
.libPaths('~/mambaforge/envs/R/lib/R/library')
library(ggtree)
library(ape)
library(tidyverse)
library(treeio)
library(viridis)
library(ggpubr)
library(RColorBrewer)
library(phytools)
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(magick)
# # Only run this the first time to convert the nex trees
# # Read in nexus
# trees <- read.nexus('hackett_sequenced_species.nex')
#
# # Re-write trees
# write.tree(trees, file = "hackett_sequenced_species.tre")
# Read in iqtree
iqtree <- read.iqtree('species.tre')
iqtree_root <- treeio::root(as.phylo(iqtree),outgroup='Geococcyx_californianus',resolve.root=TRUE)
gg <- ggtree(iqtree_root, layout = "rectangular")
gg
gg$data <- gg$data %>% mutate(label = gsub('.*\\/','',label))
gg +
geom_tippoint()+
geom_tiplab(align = TRUE)+
#xlim(c(min(gg$data$x)-.05,max(gg$data$x)*1.3))+
geom_nodelab(geom='text',mapping=aes(label=label),col='black',size=3,show.legend=F,vjust=-1,hjust=1.15)+
geom_treescale(y=10,x=0.01)+
theme(legend.position = 'none')
# Generate base tree
md <- read.table('Species_Data.txt',header=TRUE)
targ_tree <- as.phylo(iqtree_root)
ggt <- ggtree(targ_tree, layout = "rectangular",branch.length='none') %<+% md
#grab only residency
phenos <- as.data.frame(ggt$data %>% filter(isTip == TRUE))
mat <- as.matrix(phenos %>% select(Residency))
phenotypes <- setNames(as.vector(mat[, 1]), targ_tree$tip.label)
#Plot probabilities
t2 <- multi2di(targ_tree)
t2$edge.length[is.na(t2$edge.length)] <- 1e-8
# Quick ape method
fitER <- ape::ace(phenotypes,t2,model="ER",type="discrete")
# Extract nodes and the proportions for pies
nodes <- data.frame(
node=1:t2$Nnode+Ntip(t2),
fitER$lik.anc)
t2$node.label <- gsub('.*\\/','',t2$node.label)
tip_rename <- left_join(data.frame(label=t2$tip.label),md) %>%
mutate(lab = paste0(Genus,'_',Species))
t2$tip.label <- tip_rename$lab
#### Main plot, node labels < 100 bootstrap support
pdf('20250115_ReconstructionER-Cuculiformes_Residency-Node80.pdf',height=7,width=7)
cols<-setNames(viridis(3)[1:length(unique(phenotypes))],sort(unique(phenotypes)))
plotTree(t2,ftype="off")
#Node points
node_labs <- ifelse(round(as.numeric(t2$node.label)) >= 80, NA, '*')
nodelabels(node_labs,node=1:t2$Nnode+Ntip(t2),cex=0.7,bg=NA,adj = c(0.5,-.5))
nodelabels(node=1:t2$Nnode+Ntip(t2),
pie=fitER$lik.anc,piecol=cols,cex=0.4)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.5,
y=max(nodeHeights(t2)-50),fsize=2)
dev.off()
##### Supp plot, show tip points, no node labels
png('20250115_ReconstructionER-Cuculiformes_Residency-TIPS.png',height=7,width=7,units='in',res=300)
cols<-setNames(viridis(3)[1:length(unique(phenotypes))],sort(unique(phenotypes)))
plotTree(t2,ftype="off")
# Tip points
tip_colors <- cols[phenotypes]
tiplabels(pch=24, bg=tip_colors, col="black", cex=1, adj = c(2.5, 0.5))
#Node points
node_labs <- ifelse(round(as.numeric(t2$node.label)) == 100, NA, round(as.numeric(t2$node.label)))
nodelabels(node=1:t2$Nnode+Ntip(t2),
pie=fitER$lik.anc,piecol=cols,cex=0.4)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.5,
y=max(nodeHeights(t2)-50),fsize=2)
dev.off()
#### Tip labels
pdf('20250115_ReconstructionER-Cuculiformes_Residency-TIPLABELS.pdf',height=7,width=9)
cols<-setNames(viridis(3)[1:length(unique(phenotypes))],sort(unique(phenotypes)))
plotTree(t2,fsize=0.8,ftype="i")
tiplabels(pie=to.matrix(phenotypes,sort(unique(phenotypes))),piecol=cols,cex=0.05)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.5,
y=max(nodeHeights(t2)-50),fsize=2)
dev.off()
##### Migration route reconstruction ####
#grab only route
mat2 <- as.matrix(phenos %>% select(Route))
phenotypes2 <- setNames(as.vector(mat2[, 1]), targ_tree$tip.label)
# Quick ape method
fitER2 <- ape::ace(phenotypes2,t2,model="ER",type="discrete")
# Extract nodes and the proportions for pies
nodes2 <- data.frame(
node=1:t2$Nnode+Ntip(t2),
fitER2$lik.anc)
#### Plot nodes and node labels
pdf('20250115_ReconstructionER-Cuculiformes_Route-Node80-Tall.pdf',height=7,width=3.5)
cols<-setNames(viridis(7,option='turbo')[1:length(unique(phenotypes2))],sort(unique(phenotypes2)))
cols <- setNames(c('#327739','#b5d887','#5db465','darkorchid2','#b54325','white','grey90','grey50')[1:length(unique(phenotypes2))],sort(unique(phenotypes2)))
t2_clado <- compute.brlen(t2, method = "equal") # Assign equal branch lengths
plotTree(t2_clado,ftype="off")
tip_colors2 <- cols[phenotypes2]
tiplabels(pch=22, bg=tip_colors2, col="black", cex=0.5,adj=c(0,0))
node_labs <- ifelse(round(as.numeric(t2$node.label)) >= 80, NA, '*')
nodelabels(node_labs, node=1:t2$Nnode+Ntip(t2), cex=1, bg=NA, adj=c(0.5, 0), frame="none")
nodelabels(node=1:t2$Nnode+Ntip(t2),
pie=fitER2$lik.anc,piecol=cols,cex=0.8)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.9*par()$usr[1],
y=max(nodeHeights(t2)-50),fsize=2)
dev.off()
#### Plot TIP SHAPES
png('20250115_ReconstructionER-Cuculiformes_Route-TIPS.png',units='in',res=300,height=7,width=7)
cols<-setNames(viridis(7,option='turbo')[1:length(unique(phenotypes2))],sort(unique(phenotypes2)))
plotTree(t2,ftype="off")
# Tip points
tip_colors2 <- cols[phenotypes2]
tiplabels(pch=24, bg=tip_colors2, col="black", cex=1, adj = c(2.5, 0.5))
node_labs <- ifelse(round(as.numeric(t2$node.label)) == 100, NA, round(as.numeric(t2$node.label)))
nodelabels(node=1:t2$Nnode+Ntip(t2),
pie=fitER2$lik.anc,piecol=cols,cex=0.4)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.9*par()$usr[1],
y=max(nodeHeights(t2)-50),fsize=2)
dev.off()
# Tip labels
pdf('20250115_ReconstructionER-Cuculiformes_Route-TIPLABELS.pdf',height=7,width=7)
cols<-setNames(viridis(7,option='turbo')[1:length(unique(phenotypes2))],sort(unique(phenotypes2)))
plotTree(t2,fsize=0.8,ftype="i")
tiplabels(pie=to.matrix(phenotypes2,sort(unique(phenotypes2))),piecol=cols,cex=0.1)
add.simmap.legend(colors=cols,prompt=FALSE,x=0.9*par()$usr[1],
y=max(nodeHeights(t2)-50),fsize=2)
dev.off()
# For plotting with ggtree
## cols parameter indicate which columns store stats
residency_colors <- data.frame(cols=viridis(3),levs=names(nodes[,2:4]))
pies <- nodepie(nodes, cols=2:4,outline.color='black',outline.size = 0.1)
pies <- lapply(pies, function(g) g+scale_fill_manual(values = residency_colors$cols,breaks=residency_colors$levs))
t3 <- full_join(t2, data.frame(label = names(phenotypes), stat = phenotypes ), by = 'label')
tp <- ggtree(t3,layout='rectangular') %<+% md
tp$data$dummy <- 1
tp_final <- tp + geom_inset(pies, width = .09, height = .09)
tp_phenos <- tp_final +
geom_tippoint(aes(fill=Residency),pch=21,size=1.5)+
scale_fill_manual(values = residency_colors$cols,breaks=residency_colors$levs)
tp_phenos
ggsave('20250115_Cuculiformes-Residency-ggtree.pdf',tp_phenos,height=15,width=15,dpi=300)
Resubsetting samples:
GROUP=Canorus
GROUP=Optatus
~/modules/plink2 \
--bfile CanorusOptatus-n261.MQ-5X-MM1-AA-LDr2w50 \
--chr-set 29 \
--allow-extra-chr \
--set-missing-var-ids @:# \
--keep ~/merondun/cuculus_migration/admixture/${GROUP}.pid \
--geno 0.10 \
--maf 0.05 \
--make-bed \
--recode vcf bgz \
--pca \
--out ${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50
Ensure sample names don’t have any weird plink changes:
bcftools reheader --samples N261.renamed --threads 10 --output CanorusOptatus-n261.MQ-5X-MM1-AA-LDr2w50.renamed.vcf.gz CanorusOptatus-n261.MQ-5X-MM1-AA-LDr2w50.vcf.gz
bcftools query -l CanorusOptatus-n261.MQ-5X-MM1-AA-LDr2w50.vcf.gz | sed 's/_M_.*/_M/g' | sed 's/_F_.*/_F/g' > N261.renamed
Run admixture:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=10
#SBATCH --time=200:00:00
# mamba activate snps
# submit as: for GROUP in $(cat GROUPS.list); do for K in {2..10}; do sbatch -J ADMIX_${GROUP}_${K} ~/merondun/cuculus_migration/admixture/2.ADMIXTURE.sh ${GROUP} ${K}; done ; done
GROUP=$1
K=$2
wd=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/admixture/unrelated_n261/${GROUP}
bedfiles=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/autosomal_files
mkdir -p ${wd}
cd ${wd}
#Run Admixture
admixture -j7 --cv=5 ${bedfiles}/${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50.bed ${K} > ${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50.log${K}.out
#Evaluate Runs
~/modules/evalAdmix/evalAdmix -plink ${bedfiles}/${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50 -fname ${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50.${K}.P -qname ${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50.${K}.Q -P 10 -o eval_${GROUP}-n261.MQ-5X-MM1-AA-LDr2w50_${K}
The follow R script will make these plots:
evalAdmix plots
structure plots
tesselation plots for K = 2
tesselation plots with pie charts for K = 2 - 5
blank map plot with the study extent
map with the sampling of the population genetic samples
### Plot ADMIXTURE
setwd('~/merondun/cuculus_migration/admixture/full_qs/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(terra)
library(tidyverse)
library(viridis)
library(stringr)
library(meRo) #devtools::install_github('merondun/meRo')
library(LEA) #install with bioconductor, you don't actually need this if you impot your own q-matrix
library(tess3r) #install with github devtools
library(rworldmap) #for ggplot mapping
library(sf) #for spatial plotting of distance vectors to confirm
library(ggspatial) #to add scale bars onto maps
library(ggpubr)
library(RColorBrewer)
library(ggnewscale)
library(scales)
prefixes = gsub('.2.Q','',list.files('.',pattern='.*.2.Q'))
prefixes = c("Canorus-n261.MQ-5X-MM1-AA-LDr2w50","Optatus-n261.MQ-5X-MM1-AA-LDr2w50")
qdir = '.' #directory with Q files
counter = 0
cols = brewer.pal(8,'Paired')[c(3,4,7,8)]
for (admix_run in prefixes) {
counter = counter + 1
cat('Working on run: ',admix_run,'\n')
#Set if else parameters for colors, species, and shape based on the prefix string
if(grepl('Canorus',admix_run)){
colz=cols[c(2,1)]; sp='CC'; spshape=21; filt='Cuculus canorus'
} else {
colz=cols[c(3,4)]; sp='CO'; spshape=24; filt = 'Cuculus optatus'
}
prefix = admix_run
admix = melt_admixture(prefix = prefix, qdir = qdir)
#read in metadata
md = read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
# Change optatus east to just gray
md <- md %>% mutate(Population = ifelse(Population == 'COE',NA,Population),
PopColor = ifelse(Population=='COE','grey60',PopColor))
# Assign groups based on the n=40 demographic inference subset
n40 = read_tsv('~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024MAR13.pop',col_names = F)
names(n40) = c('ID','Group')
md = md %>% left_join(.,n40) %>% mutate(GroupBinary = ifelse(is.na(Group),'Other','Target'),
Group = ifelse(is.na(Group),'Other',Group))
# Only retain unrelated samples, no toepads
md_sub = md %>% filter(SpeciesShort == sp)
#md = read_tsv('~/merondun/cuculus_migration/ebird/Metadata_Breeding_2024APR24.txt')
admixmd = left_join(md_sub,admix)
# Reorder individuals baseed on longitude
admixmd = admixmd %>% mutate(ID = fct_reorder(ID,Longitude))
#loop through all admixture runs and extract the average correlation values from evalAdmix, we want to MINIMIZE this! (closest to 0)
evaldat = NULL; for (Kval in seq(2,10,1)){
r <- as.matrix(read.table(paste0("eval_",admix_run,'_',Kval)))
mean_value <- mean(r,na.rm=TRUE)
median_value <- median(r,na.rm=TRUE)
sd_value <- sd(r,na.rm=TRUE)
iqr_value <- IQR(r,na.rm=TRUE)
valdat = data.frame(K = Kval,mean = mean_value,median=median_value,sd=sd_value,iqr=iqr_value)
evaldat = rbind(valdat,evaldat)
}
#plot, for main figure show the n=3 lowest median
targs = evaldat %>% slice_min(abs(median),n=3)
ep = evaldat %>%
ggplot(aes(x=K,y=median,ymin=median-iqr,ymax=median+iqr))+
geom_rect(data=targs,aes(xmin=K-0.25,xmax=K+0.25,ymin=-Inf,ymax=Inf),fill='darkseagreen3')+
geom_text(aes(y = 0.014, label = format(signif(median, 2), scientific = TRUE)),size=2) + ylim(c(-0.015,0.015))+
geom_hline(yintercept=0,lty=2)+
geom_point()+ylab('Median +/- IQR Correlation of Residuals') +
geom_errorbar()+
theme_bw() +
scale_x_continuous(breaks = seq(min(evaldat$K), max(evaldat$K), by = 1)) +
coord_flip()
ep
assign(paste0('e',1),ep)
ggsave(paste0('~/merondun/cuculus_migration/figures/20250115_evalAdmix_',sp,'.pdf'),e1,height=5,width=6,dpi=600)
#Now plot ADMIXTURE
adplot =
admixmd %>% filter(Specified_K == 2 ) %>% #specify the levels you want
mutate(Specified_K = paste0('K',Specified_K)) %>%
ggplot(aes(x = factor(ID), y = Q, fill = factor(K), col=GroupBinary)) +
geom_col(size = 0.1) +
facet_grid(Specified_K~Species, scales = "free", space = "free") +
theme_minimal(base_size=6) + labs(x = "",y = "") +
scale_y_continuous(expand = c(0, 0),n.breaks = 3) +
scale_fill_manual(values=colz)+
scale_color_manual(values=c('gray','black'))+
theme(
panel.spacing.x = unit(0.1, "lines"),
#axis.text.x = element_blank(),
axis.text.x=element_text(angle=90,size=5),
axis.text.y = element_text(size=3),
panel.grid = element_blank(),
legend.position = 'bottom',
plot.title = element_text(size=6)
)
adplot
assign(paste0('p',counter),adplot)
ggsave(paste0('~/merondun/cuculus_migration/figures/20250115_Admixture_',sp,'.pdf'),adplot,height=2.5,width=6,dpi=600)
#save the K1/K2 Q values:
qval = admixmd %>% filter(Specified_K == 2 & K == 'K1') %>% select(ID,K1 = Q)
write.table(qval,paste0('~/merondun/cuculus_migration/admixture/Assigned_K2_Qvalues_',prefix,'.txt'),quote=F,sep='\t',row.names=F)
### Tesselation
# Import birdlife shapefiles, breeding == 2, presence ==1 means extant
bg <- st_read('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/spatial/birdlife_breeding_distributions/SppDataRequest.shp')
filtered_data <- bg[bg$PRESENCE == 1 & bg$SEASONAL == 2 & bg$SCI_NAME == filt, ]
#which K value to plot
show_k = 2
# Import Q, and then merge with the fam file to get the IDs to retain
show_q = read.table(paste0(qdir,'/',admix_run,'.',show_k,'.Q')) #read in the specific file
fam = read_tsv(paste0(prefix,'.fam'),col_names = F) %>% select(X2) %>% dplyr::rename(ID = X2)
all_q = cbind(show_q,fam)
# This will only retain the individuals from the admixture lot
retain_input = left_join(admixmd %>% select(ID,Group,Latitude,Longitude) %>% unique,all_q)
show_q_mat = as.matrix(retain_input %>% select(V1,V2)) #convert it to a matrix
class(show_q_mat) = c('tess3Q','matrix','array') #make sure tess3r thinks that it's actually a tess object
coords = retain_input %>% select(Longitude,Latitude) #convert lat and long
coords_mat = as.matrix(coords) #convert coordinates to matrix
#plot using ggplot
map.polygon <- getMap(resolution = "high")
# Jitter the lat/long
sitesp = st_as_sf(retain_input %>% select(ID,Group,Longitude,Latitude) %>% distinct %>%
mutate(loj = jitter(Longitude,amount=0.5),laj = jitter(Latitude,amount=0.5)),remove = F, coords = c("loj", "laj"), crs = 4326, agr = "constant")
sitesp = sitesp %>% mutate(Kept = ifelse(Group == 'Other','Stronghold','Other'))
pl = ggtess3Q(show_q_mat, coords_mat, map.polygon = filtered_data,col.palette = colz)
k2p1 = pl +
geom_path(data = map.polygon, aes(x = long, y = lat, group = group),col='white',lwd=0.2) +
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
new_scale_fill()+
new_scale_color()+
geom_point(data = sitesp, aes(x = loj, y = laj,fill=Group,alpha=Kept),shape=spshape, size = 2,col='black') +
scale_fill_manual(values=md$PopColor,breaks=md$Population)+
scale_alpha_manual(values=c(0.8,0.2))+
xlab("Longitude") + ylab("Latitude") +
theme_classic(base_size=8)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
k2p1
ggsave(paste0('~/merondun/cuculus_migration/figures/20250115_Tesselation_',sp,'.pdf'),k2p1,height=4,width=7,dpi=300)
#plot all K...
maximum_k = 5
for (kval in seq(2,maximum_k,1)) {
cat('Working on K = ',kval,'\n')
# Import Q, and then merge with the fam file to get the IDs to retain
show_q = read.table(paste0(qdir,'/',admix_run,'.',kval,'.Q')) #read in the specific file
fam = read_tsv(paste0(prefix,'.fam'),col_names = F) %>% select(X2) %>% dplyr::rename(ID = X2)
all_q = cbind(show_q,fam)
# This will only retain the individuals from the admixture lot
retain_input = left_join(admixmd %>% select(ID,Group,Latitude,Longitude) %>% unique,all_q)
show_q_mat = as.matrix(retain_input %>% select(matches('V'))) #convert it to a matrix
class(show_q_mat) = c('tess3Q','matrix','array') #make sure tess3r thinks that it's actually a tess object
coords = retain_input %>% select(Longitude,Latitude) #convert lat and long
coords_mat = as.matrix(coords) #convert coordinates to matrix
# First, grab the individuals and calculate the mean Q values within each cluster. Cluster will be geographic reigon, also calculate mean lat/long for plotting
kept = admixmd %>% select(ID,Specified_K,K,Q,Latitude,Longitude,Group = GeographicGroup)
group_summaries = kept %>%
filter(Specified_K == kval) %>%
group_by(Group,K) %>% #within each group and K, average lat/long/q and count number of individuals
summarize(Lat = mean(Latitude),
Long = mean(Longitude),
Q = mean(Q),
N = n_distinct(ID)) %>%
ungroup() %>% #
#Calculate scaling factors for the pies based on num samples
mutate(MinN = min(N),
MaxN = max(N)) %>%
group_by(Group) %>%
mutate(Scaling_factor = ((N - MinN) / (MaxN - MinN) * 10) + 2) %>%
select(-MinN, -MaxN)
##### Plot pies across the world
plot_pie <- function(data) {
ggplot(data, aes(x = "", y = Q, fill = K,)) +
geom_bar(col='white',lwd=0.5,width = 1, stat = "identity") +
coord_polar("y") +
scale_fill_viridis(discrete=TRUE)+
theme_void() +
theme(legend.position = "none")
}
#set up map and make a sf object from the summaries
sites = st_as_sf(group_summaries, coords = c("Long", "Lat"), crs = 4326, agr = "constant")
# Main map plot
p =
ggtess3Q(show_q_mat, coords_mat, map.polygon = filtered_data,col.palette = viridis(kval)) +
geom_path(data = map.polygon, aes(x = long, y = lat, group = group)) +
geom_sf(data = sites, aes(geometry = geometry), size = 0.1, alpha = 0.1, pch=26) +
xlab('')+ylab('')+
coord_sf(xlim = c(min(group_summaries$Long)-5, max(group_summaries$Long)+5),
ylim = c(min(group_summaries$Lat)-5, max(group_summaries$Lat)+5), expand = FALSE)+
theme_classic(base_size = 8)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
p
# Add pies
for (i in unique(group_summaries$Group)) {
subset_data = group_summaries %>% filter(Group == i)
lon = unique(subset_data$Long)
lat = unique(subset_data$Lat)
scale_factor = unique(subset_data$Scaling_factor)
cat('Scaling factor is : ',scale_factor,' for group : ',i,'\n')
pie = plot_pie(subset_data)
p <- p + annotation_custom(ggplotGrob(pie),
xmin = lon - scale_factor,
xmax = lon + scale_factor,
ymin = lat - scale_factor,
ymax = lat + scale_factor)
}
p
assign(paste0('t',kval),p)
}
png(paste0('~/merondun/cuculus_migration/figures/20250115_Tesselation_',admix_run,'_AllK.png'),res=600,units='in',height=7,width=10)
print(ggarrange(t2,t3,t4,t5,ncol=2,nrow=2))
dev.off()
}
###### Blank plot ######
#Create a small psuedo region for plotting the tesselation, makes a small file size
coords <- matrix(c(50, 60, 51, 60, 51, 61, 50, 61, 50, 60), ncol = 2, byrow = TRUE)
polygon <- st_polygon(list(coords))
sf_df <- st_sf(geometry = st_sfc(polygon))
#plot using ggplot
map.polygon <- getMap(resolution = "high")
b = ggtess3Q(show_q_mat, coords_mat, map.polygon = sf_df,col.palette = rev(cols))
blank = b +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
new_scale_fill()+
new_scale_color()+
geom_point(data = sitesp, aes(x = loj, y = laj,fill=Group,alpha=Kept),shape=26,size = 2,col='black') +
xlab("Longitude") + ylab("Latitude") +
theme_classic(base_size=8)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
blank
ggsave(paste0('~/merondun/cuculus_migration/figures/20241023_Tesselation_Blank.pdf'),blank,height=4,width=7,dpi=300)
#### Add individual points ####
sitesp = st_as_sf(md %>% filter(Analysis_ADMIXTURE_All == 1) %>% select(ID,Group,Longitude,Latitude,SpeciesShort) %>% distinct %>% mutate(loj = jitter(Longitude,amount=1),laj = jitter(Latitude,amount=1)),remove = F, coords = c("loj", "laj"), crs = 4326, agr = "constant")
sitesp = sitesp %>% mutate(Kept = ifelse(Group == 'Other','Stronghold','Other'))
pos = b +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
new_scale_fill()+
new_scale_color()+
geom_point(data = sitesp, aes(x = loj, y = laj,fill=Group,alpha=Kept,shape=SpeciesShort), size = 2,col='black') +
scale_fill_manual(breaks=c('CCW','CCE','COW','COE','Other'),values=c("#1F78B4","#A6CEE3","#E31A1C","#FB9A99","white" ))+
xlab("Longitude") + ylab("Latitude") +
scale_shape_manual(values=c(21,24))+
guides(fill=guide_legend(nrow=2,override.aes=list(shape=22)))+
scale_alpha_manual(values=c(0.8,0.4))+
theme_classic(base_size=8)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
pos
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/PopGenLocations_Map_Tesselation_2024MAY14.pdf',height=4,width=7)
pos
dev.off()Structure plots for canorus and optatus, run separately
Inferred tesselations of ancestry projected across geographic space for K=2, separately for canorus and optatus, and clipped to birdlife international’s breeding extant range.
Tesselations for K2 - K5 with pie charts showing proportions of ancestry within each region
setwd('~/merondun/cuculus_migration/admixture/')
library(tidyverse)
library(RColorBrewer)
# Reorder individuals baseed on longitude
md <- read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
md <- md %>% mutate(Population = ifelse(Population == 'COE',NA,Population),
PopColor = ifelse(Population=='COE','grey60',PopColor))
groups = c('CanorusOptatus','Canorus','Optatus')
for (group in groups) {
#read PCA data
vec = read.table(paste0(group,'-n261.MQ-5X-MM1-AA-LDr2w50.eigenvec'),header=TRUE,comment.char = '')
vec = vec %>% dplyr::rename(ID = X.FID) %>% select(-IID)
val = read.table(paste0(group,'-n261.MQ-5X-MM1-AA-LDr2w50.eigenval'),header=FALSE,comment.char = '')
val = val %>% mutate(VE = paste0(round(V1/sum(V1),2)*100,'%'))
admixmd = left_join(vec,md) %>% mutate(ID = fct_reorder(ID,Longitude)) %>% mutate(Population = ifelse(is.na(Population),'Unassigned',Population))
pca_plot <- admixmd %>%
ggplot() +
geom_point(aes(x=PC1,y=-PC2,fill = Population,shape=Species_Latin,alpha=as.factor(Analysis_Demography)),
size = 2, color = 'grey20') +
theme_bw(base_size=8) +
scale_fill_manual(values=md$PopColor,breaks=md$Population)+
scale_shape_manual(values=md$Shape,breaks=md$Species_Latin)+
scale_alpha_manual(values=c(0.25,0.9))+
xlab(paste0('PC1 (',val[1,2],')'))+
ylab(paste0('PC2 (',val[2,2],')'))+
guides(fill = guide_legend(override.aes = list(shape = 21)))+
coord_flip()
pca_plot
ggsave(paste0('../figures/20250115_PCA_',group,'.pdf'),pca_plot,height=2,width=4,dpi=300)
}
#### Calculate pairwise geographic distance between distance groups
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/isolation_by_distance')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(tidyverse)
library(viridis)
library(geosphere)
library(sf)
library(ggspatial)
library(RColorBrewer)
#Read in metadata
md <- read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
#groups for fst: distance ~ autos
analyze_these = md %>%
filter(Analysis_ADMIXTURE_All == 1) %>%
select(GeographicGroup) %>%
group_by(GeographicGroup) %>%
mutate(DistanceHaps = n()) %>%
unique %>%
filter(DistanceHaps >= 3) %>% pull(GeographicGroup)
#calculate geographic distance between the groups
dists <- md %>% filter(Analysis_ADMIXTURE_All == 1 & GeographicGroup %in% analyze_these) %>% group_by(GeographicGroup) %>% summarize(meanLat = mean(Latitude),meanLong = mean(Longitude))
dists <- dists %>% mutate(GCol = rep_len(c(viridis(9, option = 'turbo'),rev(viridis(9, option = 'turbo'))), 21),
GShape = rep_len(c(21, 24, 25, 22), 21))
dists %>% select(GCol,GShape) %>% unique %>% nrow
#plot
world = map_data("world")
sites = st_as_sf(dists, coords = c("meanLong", "meanLat"),
crs = 4326, agr = "constant")
fst_compars = ggplot() +
geom_polygon(data = world, aes(x = long, y = lat, group = group),col='grey90',fill='white') +
geom_sf(data = sites,
aes(fill=GeographicGroup,shape=GeographicGroup),
size=4,alpha=0.9,show.legend = T,stroke=0.5) +
scale_shape_manual(values=dists$GShape,breaks=dists$GeographicGroup)+
scale_fill_manual(values=dists$GCol,breaks=dists$GeographicGroup)+
xlab('')+ylab('')+
coord_sf(xlim = c(min(dists$meanLong)-5, max(dists$meanLong)+5),
ylim = c(min(dists$meanLat)-5, max(dists$meanLat)+5), expand = FALSE)+
theme_classic()+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))+
annotation_scale(line_width=0.5)
fst_compars
ggsave('../figures/20250121_FST_IBD_Populations.pdf',
fst_compars,height=4,width=9,dpi=600)
# initialize an empty data frame to store pairwise distances
pairwise_distances <- data.frame()
sub_data = dists
# calculate pairwise distances for the current 'W' GeographicGroup
for(i in 1:(nrow(sub_data) - 1)) {
for(j in (i + 1):nrow(sub_data)) {
point1 <- c(sub_data$meanLong[i], sub_data$meanLat[i])
point2 <- c(sub_data$meanLong[j], sub_data$meanLat[j])
distance_km <- distHaversine(point1, point2) / 1000 # convert to km
# append the result to the pairwise_distances data frame
pairwise_distances <- rbind(pairwise_distances,
data.frame(P1 = sub_data$GeographicGroup[i],
P2 = sub_data$GeographicGroup[j],
Distance_km = distance_km))
}
}
pairwise_distances = pairwise_distances %>%
mutate(Group = paste0(P1,'__',P2)) %>%
separate(P1,into=c('S1','G1'),remove=F) %>%
separate(P2,into=c('S2','G2'),remove=F) %>%
filter(S1 == S2) %>% select(P1,P2,Distance_km,Group)
# show the calculated pairwise distances
write_tsv(pairwise_distances,file='Pairwise_GeographicDistance_Km.txt')
#write out populations
for (pop in unique(analyze_these)) {
su = md %>% filter(Analysis_ADMIXTURE_All == 1 & GeographicGroup == pop)
write.table(su$ID,file=paste0('populations/',pop,'.pop'),quote=F,sep='\t',row.names=F,col.names=F)
}
Comparisons:
awk '{print $4}' Pairwise_GeographicDistance_Km.txt | sed '1d' > PairwiseComparisons.list
cat PairwiseComparisons.list
CC_5__CC_9
CC_6__CC_7
CC_6__CC_9
CC_7__CC_9
CO_10__CO_2
CO_10__CO_4
CO_10__CO_7
Calculate FST:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=2
#SBATCH --time=48:00:00
# mamba activate snps
# for i in $(cat PairwiseComparisons.list); do sbatch -J FST_${i} 2.Pairwise_Distance_FST.sh ${i}; done
GROUP=$1
VCF=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/autosomal_files/CanorusOptatus-n261.MQ-5X-MM1-AA-LDr2w50.vcf.gz
wd=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/isolation_by_distance
for CHR in $(cat Chromosomes.list); do
echo "WORKING ON CHR: ${GROUP} and ${CHR}"
p1=$(echo ${GROUP} | sed 's/__.*//g')
p2=$(echo ${GROUP} | sed 's/.*__//g')
mkdir -p work out
#calculate fst
~/modules/vcftools/bin/vcftools --gzvcf ${VCF} --fst-window-size 100000 --fst-window-step 100000 --weir-fst-pop populations/${p1}.pop --weir-fst-pop populations/${p2}.pop --out work/${CHR}_${GROUP}
awk -v g=${GROUP} '{OFS="\t"}{print $1, $2, $3,$4,$5, g}' work/${CHR}_${GROUP}.windowed.weir.fst > out/${CHR}_${GROUP}.fst
fi
done
Merge output:
mergem *fst > ../20250122_Pairwise_FST_DistanceGroups_Autosomes.txt
Plot final correlations:
#### Plot Distance ~ FST
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/isolation_by_distance')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(tidyverse)
library(viridis)
library(geosphere)
library(broom)
library(ggpubr)
library(lme4)
library(lmerTest)
library(broom.mixed)
library(meRo)
library(RColorBrewer)
#Read in metadata
md = read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
d = read_tsv('20250122_Pairwise_GeographicDistance_Km.txt')
f = read_tsv('20250122_Pairwise_FST_DistanceGroups_Autosomes.txt')
#prep and merge frames
names(f) = c('chr','start','end','snps','FST','Group')
f2 = f %>% mutate(FST = pmax(0, pmin(1, FST))) %>% select(!c(start,end,snps))
#merge with geographic distance
df = left_join(f2,d)
dfs = df %>% group_by(Group,P1,P2,Distance_km) %>%
sum_stats(FST)
dfs %>% ggplot(aes(x=log(Distance_km),y=log(mean)))+
geom_point()+
geom_smooth()+
theme_bw()
dfs <- dfs %>% mutate(Species = ifelse(grepl('CO',Group),'C. optatus','C. canorus'))
# Spearman's rho and cor test within each AvZ level and gather coefficients
cors = dfs %>% group_by(Species) %>%
summarize(
rho = cor(mean, Distance_km,method='spearman'),
p_value = cor.test(mean, Distance_km,method='spearman')$p.value) %>%
mutate(padj = p.adjust(p_value,method='bonferroni'),
signif = case_when(
padj < 0.05 ~ "*",
TRUE ~ " (n.s.)"),
label = paste0('rs: ',round(rho, 2), signif))
cors
# Species rho p_value padj signif label
# <chr> <dbl> <dbl> <dbl> <chr> <chr>
# 1 C. canorus 0.772 0 0 "*" rs: 0.77*
# 2 C. optatus 0.221 0.427 0.853 " (n.s.)" rs: 0.22 (n.s.)
cols = c('darkorchid2','orange')
dfs$Species = factor(dfs$Species,levels=c('C. canorus','C. optatus'))
#lmm, account for P1 and P2
model_summaries <- dfs %>%
group_by(Species) %>%
do({
model <- lmer(log(pmax(mean, 0.005)) ~ log(pmax(Distance_km, 0.005)) + (1|P1) + (1|P2), data = .)
summary_df <- broom.mixed::tidy(model, effects = "fixed", conf.int = TRUE)
}) %>%
dplyr::bind_rows()
#output fixed effect of distance, bonferroni correction
model_summaries %>% filter(grepl('Distance',term)) %>% select(-effect,-term) %>% ungroup %>%
mutate(padj = p.adjust(p.value,method='bonferroni'),
signif = ifelse(padj < 0.05,'*','n.s.'))
# Species estimate std.error statistic df p.value conf.low conf.high padj signif
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 C. canorus 0.743 0.0465 16.0 98.8 3.82e-29 0.651 0.836 7.64e-29 *
# 2 C. optatus 0.446 0.0691 6.45 5.43 9.77e- 4 0.272 0.619 1.95e- 3 *
# Create the plot
pp1 = ggplot(dfs, aes(x = log(pmax(Distance_km,0.005)), y = log(pmax(mean,0.005)), color = Species, shape = Species)) +
geom_point(size=1) + #0.25 for main plot
geom_smooth(method = "lm", se = TRUE,lwd=0.75) + #0.5 for main plot
geom_text(data = cors[1,], aes(x = -Inf, y = Inf, label = paste0('C. canorus rho: ',round(rho,3))), size = 3,vjust = 2, hjust = -.25) +
geom_text(data = cors[2,], aes(x = -Inf, y = Inf, label = paste0('C. optatus rho: ',round(rho,3))), size = 3,vjust = 4, hjust = -.25) +
labs(title = "Relationship between FST and Distance_km",
x = "log(Geographic Distance (km))",
y = "log(FST)") +
scale_color_manual(values=cols)+
theme_bw(base_size=6) + theme(legend.position = 'none')
pp1
ggsave('~/merondun/cuculus_migration/figures/20250115_FSTvGeoDistanceSpecies.pdf',pp1,height=3,width=3.5,dpi=300)
#### Fst / (1 - FST) and log(dist)
cors_rousset = dfs %>% group_by(Species) %>%
summarize(
rho = cor(mean, Distance_km,method='spearman'),
p_value = cor.test( (mean / (1 - mean)), Distance_km,method='spearman')$p.value) %>%
mutate(label = paste0('rs: ',round(rho, 2)))
cors_rousset
#lmm, account for P1 and P2
model_summaries_rousset <- dfs %>%
group_by(Species) %>%
do({
model <- lmer( (mean / (1 - mean)) ~ log(pmax(Distance_km, 0.005)) + (1|P1) + (1|P2), data = .)
summary_df <- broom.mixed::tidy(model, effects = "fixed", conf.int = TRUE)
}) %>%
dplyr::bind_rows()
#output fixed effect of distance, bonferroni correction
model_summaries_rousset %>% filter(grepl('Distance',term)) %>% select(-effect,-term) %>% ungroup %>%
mutate(padj = p.adjust(p.value,method='bonferroni'),
signif = ifelse(padj < 0.05,'*','n.s.'))
# Species estimate std.error statistic df p.value conf.low conf.high padj signif
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
# 1 C. canorus 0.0310 0.00220 14.1 89.9 1.85e-24 0.0266 0.0354 3.70e-24 *
# 2 C. optatus 0.00737 0.000740 9.96 5.14 1.49e- 4 0.00548 0.00926 2.98e- 4 *
# Create the plot
pp2 = ggplot(dfs, aes(x = log(pmax(Distance_km,0.005)), y = (mean / (1 - mean)), color = Species, shape = Species)) +
geom_point(size=1) + #0.25 for main plot
geom_smooth(method = "lm", se = TRUE,lwd=0.75) + #0.5 for main plot
geom_text(data = cors[1,], aes(x = -Inf, y = Inf, label = paste0('C. canorus rho: ',round(rho,3))), size = 3,vjust = 2, hjust = -.25) +
geom_text(data = cors[2,], aes(x = -Inf, y = Inf, label = paste0('C. optatus rho: ',round(rho,3))), size = 3,vjust = 4, hjust = -.25) +
labs(title = "Relationship between FST and Distance_km",
x = "log(Geographic Distance (km))",
y = "FST / (1 - FST)") +
scale_color_manual(values=cols)+
theme_bw(base_size=6) + theme(legend.position = 'none')
pp2
ggsave('~/merondun/cuculus_migration/figures/20250115_FSTvGeoDistanceRoussetSpecies.pdf',pp1,height=3,width=3.5,dpi=300)
### Plot ADMIXTURE
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/admixture/unrelated_all/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(terra)
library(tidyverse)
library(viridis)
library(stringr)
library(meRo) #devtools::install_github('merondun/meRo')
library(LEA) #install with bioconductor, you don't actually need this if you impot your own q-matrix
library(tess3r) #install with github devtools
library(rworldmap) #for ggplot mapping
library(sf) #for spatial plotting of distance vectors to confirm
library(ggspatial) #to add scale bars onto maps
library(ggpubr)
library(RColorBrewer)
library(ggnewscale)
library(geosphere)
#### Identify western and eastern strongholds ####
#we want to assign migratory tracks based on their breeding locations, so first, let's import the K1/K2 ancestry coefficients and find the longitude of the most 100% western and eastern groups as our threshold
# canorus: K1 = WEST, optatus: K1 = EAST
md <- read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
#create a bounding box for each species based on the K1 and K2 strongholds
thresholds = md %>%
drop_na(K1) %>%
# Assign samples based on their qvalue and species
mutate(Group = ifelse(K1 > 0.99 & SpeciesShort == 'CC','CCW',
ifelse(K1 < 0.01 & SpeciesShort == 'CC','CCE',
ifelse(K1 > 0.99 & SpeciesShort == 'CO','COE',
ifelse(K1 < 0.01 & SpeciesShort == 'CO','COW','Unassigned'))))) %>%
filter(Group != 'Unassigned') %>%
group_by(Group) %>%
summarize(longmn = min(Longitude),
longmx = max(Longitude),
latmn = min(Latitude),
latmx = max(Latitude))
thresholds
# Plot those regions
map.polygon <- getMap(resolution = "high")
strong = ggplot() +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
new_scale_fill()+
new_scale_color()+
geom_rect(data = thresholds, aes(xmin = longmn, xmax = longmx, ymin=latmn, ymax=latmx, fill=Group),alpha=0.5,col='black',lty=2) +
scale_fill_manual(breaks=c('CCW','CCE','COW','COE','Other'),values=c("#1F78B4","#A6CEE3","#E31A1C","#FB9A99","white" ))+
xlab("Longitude") + ylab("Latitude") +
theme_classic(base_size=8)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
png('../../figures/ADMIXTURE_StrongholdsK2_2024MAY15.png',units='in',res=300,height=4,width=7)
strong
dev.off()
#### Add Migration Data ####
m <- read_csv('../../spatial/03-BoD-deaths-removed-no-NDVI.csv')
m %>% select(dataset, individual.taxon.canonical.name,tag.local.identifier) %>% unique %>% count(individual.taxon.canonical.name)
tracks_df = m %>% filter(!grepl('satura',individual.taxon.canonical.name)) %>% select(individual.taxon.canonical.name,timestamp,Route,Wolf,tag.local.identifier,location.long,location.lat)
names(tracks_df) = c('Species','DT','Route','WolfID','ID','Long','Lat')
#assign 'year1', 'year2', 'year3', etc, starting with 'november' as the base
tracks_df2 = tracks_df %>%
group_by(ID) %>%
mutate(DT = mdy_hm(DT)) %>%
arrange(DT) %>%
# For each cuckoo, assign a year identifier - the 'year' starts at November as they were rung in the summer, so summer - nov = year 0
# This will also add the number of days in between GPS locations
mutate(
Year_Num = cumsum(month(DT) == 11 & lag(month(DT), default = 10) != 11), # Increment each November
Year_Label = paste0("Year", Year_Num),
Days_Between = as.numeric(DT - lag(DT),units='days')
) %>%
ungroup() %>%
# Also add how many years were sampled for each cuckoo
# This will also assign the geographic distance (in km) between consecutive GPS positions
group_by(ID) %>%
mutate(Years = n_distinct(Year_Label),
Prev_Long = lag(Long),
Prev_Lat = lag(Lat),
Distance = (distHaversine(cbind(Prev_Long, Prev_Lat), cbind(Long, Lat))/1000)) %>%
ungroup %>% select(-Prev_Long,-Prev_Lat) %>%
# And finally, add a case condition for if the points are during a typical breeding or non-breeding season
mutate(
Status = case_when(
month(DT) %in% c(1,12) ~ "Overwintering",
month(DT) %in% c(6) ~ "Breeding",
#(month(DT) == 5 & day(DT) >= 15) | (month(DT) == 6 & day(DT) <= 15) ~ "Breeding",
TRUE ~ NA_character_
)) %>%
# Replace NA in distance / days_between with 0 since they are the initial point
replace_na(list(Days_Between = 0, Distance = 0))
# Plot time on the x axis, with the distance between points on the y axis for all indivdiauls by color to see if there are migration peak times
monthdist = tracks_df2 %>%
ggplot(aes(x=month(DT),y=Distance,col=ID))+
geom_point()+
geom_line()+
xlab('Month')+ylab('Distance between fixes (km)')+
theme_bw()+
theme(legend.position='none')
png('../../figures/Migration_MonthlyDistances_2024MAY15.png',units='in',res=300,height=3,width=6)
monthdist
dev.off()
# # Distance traveled by month
meandist = tracks_df2 %>%
group_by(month(DT)) %>%
sum_stats(Distance) %>%
ggplot(aes(x=as.factor(`month(DT)`),y=mean,ymin=conf_low,ymax=conf_high))+
geom_point()+ ylab('Sequential Distance (km)')+xlab('Month')+
geom_errorbar()+
theme_bw()
png('../../figures/Migration_MeanMonthlyDistances_2024MAY15.png',units='in',res=300,height=3,width=6)
meandist
dev.off()
# Summaries of distance between points and time between points
tracks_df2 %>%
select(Species,ID,Days_Between,Distance) %>%
pivot_longer(c(Days_Between,Distance)) %>%
ggplot(aes(x=value,fill=Species))+
geom_histogram()+
facet_wrap(name~.,scales='free')+
theme_bw()
#here, simply assign mean breeding as mean location over all years in june
breed_dat <- tracks_df2 %>%
#filter(Days_Between < 10 & Distance < 1000) %>%
#filter(Status == 'Breeding') %>%
#filter(Status == 'Overwintering') %>%
drop_na(Status) %>%
group_by(Species,ID,Status) %>%
# Assign mean lat and long during the breeding season
summarize(Longitude = mean(Long), Latitude=mean(Lat)) %>%
ungroup
# Function to check if coordinates fall within a bounding box
in_bbox <- function(longitude, latitude, bbox) {
longitude >= bbox$longmn & longitude <= bbox$longmx & latitude >= bbox$latmn & latitude <= bbox$latmx
}
# Assign breeding locations to groups based on species and bounding boxes
breed_dat <- breed_dat %>%
mutate(Group = case_when(
Species == "Cuculus canorus" & Status == "Breeding" & in_bbox(Longitude, Latitude, thresholds[thresholds$Group == "CCE", ]) ~ "CCE",
Species == "Cuculus canorus" & Status == "Breeding" & in_bbox(Longitude, Latitude, thresholds[thresholds$Group == "CCW", ]) ~ "CCW",
Species == "Cuculus optatus" & Status == "Breeding" & in_bbox(Longitude, Latitude, thresholds[thresholds$Group == "COW", ]) ~ "COW",
Species == "Cuculus optatus" & Status == "Breeding" & in_bbox(Longitude, Latitude, thresholds[thresholds$Group == "COE", ]) ~ "COE",
TRUE ~ "Other"
))
# Confirm
breed_assign = ggplot() +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
new_scale_fill()+
new_scale_color()+
geom_rect(data = thresholds, aes(xmin = longmn, xmax = longmx, ymin=latmn, ymax=latmx, fill=Group),alpha=0.5,col='black',lty=2) +
geom_point(data=breed_dat %>% filter(Status == 'Breeding'),aes(fill=Group,x=Longitude,y=Latitude,shape=Species),col='black')+
scale_fill_manual(breaks=c('CCW','CCE','COW','COE','Other'),values=c("#1F78B4","#A6CEE3","#E31A1C","#FB9A99","white" ))+
scale_shape_manual(values=c(21,24))+
xlab("Longitude") + ylab("Latitude") +
theme_classic(base_size=8)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
breed_assign
png('../../figures/ADMIXTURE_StrongholdsK2-Assigned_2024MAY15.png',units='in',res=300,height=4,width=7)
breed_assign
dev.off()
# now that we have assigned each individual a unique ID, let's merge this with the full frame
breed_id <- breed_dat %>% ungroup %>% select(ID,Group) %>% unique
tracks_df3 <- left_join(tracks_df2,breed_id) %>%
#Individuals which did not have june locations should also be assigned 'Other'
replace_na(list(Group = 'Other'))
tracks_df3 %>% distinct(ID,Group) %>% count(Group)
# # A tibble: 5 × 2
# Group n
# <chr> <int>
# 1 CCE 20
# 2 CCW 110
# 3 COE 6
# 4 COW 4
# 5 Other 202
#create points
tracks = st_as_sf(tracks_df3 %>% mutate(Alpha = ifelse(Group == 'Other','Other','Target')), coords = c("Long", "Lat"),
crs = 4326, agr = "constant")
#create lines
tracks_lines <- tracks %>%
group_by(ID, Species, Group, Alpha) %>%
filter(n() >= 2 ) %>% # Keep only groups with at least two records
summarize(geometry = st_union(geometry)) %>%
st_cast("LINESTRING") %>%
st_as_sf()
migp = ggplot() +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
new_scale_fill()+
new_scale_color()+
geom_sf(data=tracks_lines,aes(col=Group,group=ID,alpha=Alpha)) +
scale_color_manual(breaks=c('CCW','CCE','COW','COE','Other'),values=c("#1F78B4","#A6CEE3","#E31A1C","#FB9A99","grey40" ))+
scale_alpha_manual(breaks=c('Other','Target'),values=c(0.2,0.8))+
xlab("Longitude") + ylab("Latitude") +
theme_classic(base_size=8)+
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
migp
png('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/Migration_Map_AssignedK2_2024MAY14.png',units='in',res=300,height=4,width=7)
migp
dev.off()
#### For examining overwintering grounds #####
library(concaveman)
library(smoothr)
#Only select locations in january for our 4 target groups
winter = tracks_df3 %>%
filter(Group != 'Other' & (month(DT) %in% c(1)))
winter_tracks <- st_as_sf(winter, coords = c("Long", "Lat"), crs = 4326, agr = "constant")
winter_poly = list()
for (group in unique(winter_tracks$Group)) {
cat('Making polygon for group: ',group,'\n')
t <- winter_tracks %>% filter(Group == group)
#concavity is arbitrary, higher number provides a smoother polygon
poly <- concaveman(t,concavity=5,length_threshold=2.5) %>%
mutate(Group = group)
winter_poly[[group]] = poly
}
winterp = do.call(rbind,winter_poly)
winter_grounds = ggplot() +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
new_scale_fill()+
new_scale_color()+
geom_sf(data=tracks_lines,aes(col=Group,group=ID,alpha=Alpha)) +
scale_color_manual(breaks=c('CCW','CCE','COW','COE','Other'),values=c("#1F78B4","#A6CEE3","#E31A1C","#FB9A99","grey40" ))+
scale_alpha_manual(breaks=c('Other','Target'),values=c(0.2,0.8))+
geom_sf(data = winterp,aes(fill=Group),alpha=0.6,col='black',lty=3,lwd=0.5)+
scale_fill_manual(breaks=c('CCW','CCE','COW','COE','Other'),values=c("#1F78B4","#A6CEE3","#E31A1C","#FB9A99","grey40" ))+
xlab("Longitude") + ylab("Latitude") +
theme_classic(base_size=8)+
coord_sf(xlim = c(-20,185),
ylim = c(-30,75), expand = FALSE)+
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
winter_grounds
png('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/Migration_Map_Overwintering_AssignedK2_2024MAY14.png',units='in',res=300,height=4,width=7)
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/Migration_Map_Overwintering_AssignedK2_2024MAY14.pdf',height=4,width=7)
winter_grounds
dev.off()
# #Calculate distance between mean breeding locations across years
# dist_dat = NULL
# for (id in unique(multi_breed_dat$ID)){
# s <- multi_breed_dat %>% filter(ID == id)
# yeardat = NULL
#
# #now loop through each consecutive year
# for (year in seq(1,nrow(s)-1,1)){
# d <- s %>% mutate(Distance = distHaversine(cbind(Longitude[year],Latitude[year]),cbind(Longitude[year+1],Latitude[year+1]))/1000) %>% ungroup %>% select(Species,ID,Distance) %>% unique
# yeardat = rbind(yeardat,d %>% mutate(Year = year))
# }
# dist_dat = rbind(dist_dat,yeardat)
# }
#
# #plot the distances observed between sequential years
# dist_dat %>% ggplot(aes(x=Distance,fill=Species))+
# geom_histogram()+
# theme_bw()
#
# #Plot Breeding Locations Over Years for the individuals with low breeding ground fidelity
# plot_breed = tracks_df2 %>%
# #filter(Status == 'Breeding') %>%
# filter(Status == 'Overwintering') %>%
# group_by(ID) %>% mutate(BreedYears = n_distinct(Year_Label)) %>% filter(BreedYears>1)
# plot_ids = dist_dat %>% arrange(desc(Distance)) %>% pull(ID) %>% head(3)
# plot_sub = plot_breed %>% filter(ID %in% plot_ids)
#
# #Just plot the first 5 individuals
# ggplot() +
# geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='grey90',lwd=0.2) +
# coord_sf(xlim = c(-20,185),
# ylim = c(-30,75), expand = FALSE)+
# new_scale_fill()+
# new_scale_color()+
# geom_point(data = plot_sub, aes(x = Long, y = Lat,col=Year_Label,shape=ID), size = 2) +
# xlab("Longitude") + ylab("Latitude") +
# scale_shape_manual(values=c(16,3,8))+
# scale_color_manual(values=viridis(length(unique(plot_sub$Year_Label))))+
# theme_classic(base_size=8)+
# theme(panel.border = element_rect(colour = "black", fill=NA, size=1),panel.background = element_rect(fill = "aliceblue"))+
# theme(legend.position = 'top',legend.text = element_text(size = 6),legend.title = element_text(size = 6),legend.key.size = unit(0.1, 'cm'))
First, ascribing a bounding box based on the genetic ancestry Q > 99% for west / east.
Strongholds of > 99% ancestry coefficients for western and eastern groups. Individuals with a mean breeding area within these boxes are assigned to each group.
Distances traveled in each month
Mean monthly averages between fixes, seems like june and january are the best months to select for breeding and overwintering.
For each individual’s tracking data, I calculated the mean lat/long across all years in June. I then intersected this with the bounding boxes to assign each individual into a genetic stronghold.
We can then color the tracks based on the individual’s ancestry:
Great, now let’s identify overwintering locations for the 4 groups. Unfortunately we do not have ANY january locations for our optatus east samples, and only a single individual for optatus west, so we cannot make polygons for optatus.
Put them together with the migration tracks:
### Subset N = 40 based on geography.
setwd('~/merondun/cuculus_migration/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(terra)
library(tidyverse)
library(viridis)
library(RColorBrewer)
library(meRo) #devtools::install_github('merondun/meRo')
library(rworldmap)
library(sf) #for spatial plotting of distance vectors to confirm
library(ggspatial) #to add scale bars onto maps
library(ggpubr)
library(lubridate)
library(geosphere)
#read in metadata
md = read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
#popgen dat
all = md %>% filter(Analysis_ADMIXTURE == 1 ) %>%
mutate(Species = gsub('C. optatus','Cuculus optatus',
gsub('C. c. canorus','Cuculus canorus',
gsub('C. c. bakeri','Cuculus canorus',Species_Latin))),
LatJit = jitter(Latitude,amount = 1),
LonJit = jitter(Longitude,amount = 1))
#and assign groups
all = all %>%
mutate(Group = ifelse(Species == 'Cuculus canorus' & Latitude > 45 & Longitude < 5, 'CanWest',
ifelse(Species == 'Cuculus canorus' & Latitude > 35 & Longitude > 135 & Longitude < 165,'CanEast',
ifelse(Species == 'Cuculus optatus' & Latitude > 40 & Longitude > 75 & Longitude < 120,'OptWest',
ifelse(Species == 'Cuculus optatus' & Longitude > 135 & Longitude < 150,'OptEast',
'Other')))))
all %>% count(Group)
alljit = all %>% mutate(loj = jitter(Longitude,amount=1),laj = jitter(Latitude,amount=1))
popgen_points = st_as_sf(alljit, coords = c("loj", "laj"),
crs = 4326, agr = "constant", remove=F)
#plot breeding polygons
cols = brewer.pal(12,'Paired')[c(1,2,5,6)]
map.polygon <- getMap(resolution = "low")
unrel_breed = ggplot() +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='black',lwd=0.1) +
geom_sf(data = popgen_points,aes(fill=Group,shape=Species),alpha=0.75,size=2,stroke=0.25)+
#facet_wrap(Species~.)+
xlab('') + ylab('') +
scale_shape_manual(values=c(21,24))+
scale_fill_manual(values=c(cols,'white'))+
coord_sf(xlim = c(-10,185),
ylim = c(-30,75), expand = FALSE)+ #coord_equal() + #FOR PUBLICATION MAIN FIGURE USE THIS, WON'T STRETCH / DISTORT LATITUDES
theme_classic() +
guides(fill=guide_legend(nrow=2,override.aes=list(shape=22)))+
theme(panel.border = element_rect(colour = "black", fill = NA, size = 1),
panel.background = element_rect(fill = "aliceblue"),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.key.size = unit(0.5, 'cm'),
legend.position = 'top') +
annotation_scale(line_width = 0.5)
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/Unrelated_Breeding_N190_Distribution_2024APR26.pdf',height=4,width=7)
unrel_breed
dev.off()
all %>% filter(Group != 'Other') %>% group_by(Group) %>% sum_stats(MeanCoverage)
all %>% filter(Group != 'Other') %>% count(Species,Group) %>%
ggplot(aes(x=Group,fill=Group,y=n,label=n))+geom_bar(stat='identity')+
geom_text(vjust=-0.2)+ylab('Count Unrelated Individuals')+xlab('Migratory Group')+
scale_fill_manual(values=cols)+
facet_grid(.~Species,scales='free')+
theme_bw(base_size=18) + theme(legend.position='none')
write.table(all %>% ungroup %>% filter(Group != 'Other') %>% select(ID,Group),file='~/merondun/cuculus_migration/Samples_Demography_All_CCW-CCE-COW-COE_2024MAY02.pop',quote=F,sep='\t',row.names=F,col.names=F)
#retain n = 10 samples from each stronghold with the highest coverage
kept = all %>% filter(Group != 'Other') %>% group_by(Species,Group) %>% slice_max(MeanCoverage,n=10)
kept %>% count(Species,Group)
kept %>% group_by(Species,Group) %>% sum_stats(MeanCoverage)
kept %>% count(Group)
write.table(kept %>% ungroup %>% select(ID,Group),file='~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024APR26.pop',quote=F,sep='\t',row.names=F,col.names=F)
write.table(kept %>% ungroup %>% select(ID),file='~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024APR26.list',quote=F,sep='\t',row.names=F,col.names=F)
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_highmem
#SBATCH --cpus-per-task=10
#SBATCH --time=48:00:00
CHR=$1
mkdir full_vcf
raw_vcf_dir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/merged/snps_only/
#From the full gVCF, including invariant sites, subset only samples of interest
bcftools view --threads 10 --types snps --samples-file ~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024MAR13.list ${raw_vcf_dir}/${CHR}.SNPS.vcf.gz -Ou | \
bcftools view --min-ac 1 --min-alleles 2 --max-alleles 2 -Oz -o full_vcf/${CHR}.raw.vcf.gz
#Apply filtering, only on MQ, DP
bcftools view --threads 10 --max-alleles 2 -i 'MQ > 30 && INFO/DP > 150 && QUAL > 20' -Oz -o full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz full_vcf/${CHR}.raw.vcf.gz
bcftools index full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz
#Phase VCF with beagle
java -jar -Xmx160g ~/modules/beagle.28Jun21.220.jar gt=full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz out=full_vcf/${CHR}.MQ20-DP150-Q20.PHASED nthreads=8 window=40 overlap=2 impute=true
bcftools index --threads 10 full_vcf/${CHR}.MQ20-DP150-Q20.PHASED.vcf.gz
#add the INFO DP,MQ and FMT/DP annotations back onto this VCF, from the pre-phased VCF
bcftools annotate --threads 10 -a full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz -c INFO/DP,INFO/MQ,FMT/DP full_vcf/${CHR}.MQ20-DP150-Q20.PHASED.vcf.gz -Oz -o full_vcf/${CHR}.MQ20-DP150-Q20.PHASED-ANN.vcf.gz
bcftools index --threads 10 full_vcf/${CHR}.MQ20-DP150-Q20.PHASED-ANN.vcf.gz
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=serial
#SBATCH --partition=serial_std
#SBATCH --mem=30000mb
#SBATCH --cpus-per-task=10
#SBATCH --time=12:00:00
#neutral sites
neutral=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/unrelated_breeding/neutral-sites_cuckoo__intergenic-intron-4fold.bed
#this bed contains BAD sites which are repeats (bed does not include 'low_complexity' and 'simple_repeats', so it's largely TEs)
repeats=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/unrelated_breeding/GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats-ExcludingLowComplexity-SimpleRepeats.bed
MINDP=5
bcftools concat --threads 10 -Ou full_vcf/*Q20.vcf.gz | \
bcftools sort -Ou | \
#set genotypes below 5x to missing
bcftools +setGT -Ou -- -t q -i "FMT/DP < ${MINDP}" -n "./." | \
#update AC fields after changing genotypes to missing
bcftools +fill-tags -Ou -- -t AC,AN | \
bcftools view --min-ac 2 -e 'F_MISSING > 0.1' -Oz -o merged/Autosomes.MQ20-DP150-Q20-MM1-MAC2.vcf.gz
bcftools view --threads 10 merged/Autosomes.MQ20-DP150-Q20-MM1-MAC2.vcf.gz -Ov | \
#RETAIN neutral sites
bedtools intersect -header -a - -b $neutral | \
#EXCLUDE repeats
bedtools subtract -header -a - -b $repeats | \
#LD PRUNE
bcftools +prune -m 0.2 --window 50 -Oz -o merged/Autosomes.MQ20-DP150-Q20-MM1-MAC2-Neutral-LDr2w50.vcf.gz
bcftools index merged/Autosomes.MQ20-DP150-Q20-MM1-MAC2-Neutral-LDr2w50.vcf.gz
~/modules/plink2 --threads 10 --vcf merged/Autosomes.MQ20-DP150-Q20-MM1-MAC2-Neutral-LDr2w50.vcf.gz --chr-set 20 --allow-extra-chr --set-missing-var-ids @:# --make-bed --out merged/Autosomes.MQ20-DP150-Q20-MM1-MAC2-Neutral-LDr2w50
Subsample N=20 per group for sensitivity:
### Subset N = 80 based on geography.
setwd('~/merondun/cuculus_migration/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(terra)
library(tidyverse)
library(viridis)
library(RColorBrewer)
library(meRo) #devtools::install_github('merondun/meRo')
library(rworldmap)
library(sf) #for spatial plotting of distance vectors to confirm
library(ggspatial) #to add scale bars onto maps
library(ggpubr)
library(lubridate)
library(geosphere)
#read in metadata
md = read_tsv('~/merondun/cuculus_migration/Full_Metadata.txt')
#popgen dat
all = md %>% filter(Analysis_ADMIXTURE_Breeding == 1 ) %>%
mutate(Species = gsub('C. optatus','Cuculus optatus',
gsub('C. canorus','Cuculus canorus',Species_Latin)),
LatJit = jitter(Latitude,amount = 1),
LonJit = jitter(Longitude,amount = 1))
#and assign groups: OLDER, only N=40
all = all %>%
mutate(Group = ifelse(Species == 'Cuculus canorus' & Latitude > 45 & Longitude < 5, 'CCW',
ifelse(Species == 'Cuculus canorus' & Latitude > 35 & Longitude > 135 & Longitude < 165,'CCE',
ifelse(Species == 'Cuculus optatus' & Latitude > 40 & Longitude > 75 & Longitude < 120,'COW',
ifelse(Species == 'Cuculus optatus' & Longitude > 135 & Longitude < 150,'COE',
'Other')))))
#and assign groups: VERY flexible, greater numbers
all = all %>%
mutate(Group = ifelse(Species == 'Cuculus canorus' & Latitude > 45 & Longitude < 40, 'CCW',
ifelse(Species == 'Cuculus canorus' & Latitude > 35 & Longitude > 100 & Longitude < 165,'CCE',
ifelse(Species == 'Cuculus optatus' & Longitude < 120,'COW',
ifelse(Species == 'Cuculus optatus' & Longitude > 120 ,'COE',
'Other')))))
all %>% count(Group)
alljit = all %>% mutate(loj = jitter(Longitude,amount=1),laj = jitter(Latitude,amount=1))
popgen_points = st_as_sf(alljit, coords = c("loj", "laj"),
crs = 4326, agr = "constant", remove=F)
#plot breeding polygons
cols = brewer.pal(12,'Paired')[c(1,2,5,6)]
map.polygon <- getMap(resolution = "low")
unrel_breed = ggplot() +
geom_polygon(data = map.polygon, aes(x = long, y = lat, group = group),fill='white',col='black',lwd=0.1) +
geom_sf(data = popgen_points,aes(fill=Group,shape=Species),alpha=0.75,size=2,stroke=0.25)+
#facet_wrap(Species~.)+
xlab('') + ylab('') +
scale_shape_manual(values=c(21,24))+
scale_fill_manual(values=c(cols,'white'))+
coord_sf(xlim = c(-10,185),
ylim = c(-30,75), expand = FALSE)+ #coord_equal() + #FOR PUBLICATION MAIN FIGURE USE THIS, WON'T STRETCH / DISTORT LATITUDES
theme_classic() +
guides(fill=guide_legend(nrow=2,override.aes=list(shape=22)))+
theme(panel.border = element_rect(colour = "black", fill = NA, size = 1),
panel.background = element_rect(fill = "aliceblue"),
legend.text = element_text(size = 10),
legend.title = element_text(size = 10),
legend.key.size = unit(0.5, 'cm'),
legend.position = 'top') +
annotation_scale(line_width = 0.5)
unrel_breed
png('deprecated_sampling/double_sampling_2024AUG/Distribution_Groups_2024AUG12.png',units='in',res=300,height=6,width=10)
unrel_breed
dev.off()
all %>% filter(Group != 'Other') %>% group_by(Group) %>% sum_stats(MeanCoverage)
full_set <- all %>% filter(Group != 'Other') %>% select(ID,Group)
write.table(full_set,file='~/merondun/cuculus_migration/admixture/Population_Designations_Full.pop',quote=F,sep='\t',row.names=F)
#retain n = 20 samples from each stronghold with the most similar coverage
kept <- all %>% filter(Group != 'Other') %>% select(ID,Species,Group,Latitude,Longitude,MeanCoverage) %>%
mutate(Overall = mean(MeanCoverage), Diff = abs(MeanCoverage - Overall)) %>% # Calculate difference from mean
group_by(Species,Group) %>% slice_min(Diff,n=20)
kept %>% count(Species,Group)
covplot <- kept %>% group_by(Species,Group) %>% sum_stats(MeanCoverage) %>%
ggplot(aes(x=Group,y=mean,ymin=conf_low,ymax=conf_high,col=Group))+
geom_errorbar()+
geom_point(size=2.5)+ylab('Coverage by Group (mean & 95% CIs')+
scale_color_manual(values=c(cols,'white'))+
theme_bw()
png('deprecated_sampling/double_sampling_2024AUG/Coverage_Groups_2024AUG12.png',units='in',res=300,height=4,width=5)
covplot
dev.off()
kept %>% count(Group)
write.table(kept %>% ungroup %>% select(ID,Group),file='~/merondun/cuculus_migration/deprecated_sampling/double_sampling_2024AUG/Samples_Demography_N20_CCW-CCE-COW-COE_2024AUG12.pop',quote=F,sep='\t',row.names=F,col.names=F)
write.table(kept %>% ungroup %>% select(ID),file='~/merondun/cuculus_migration/deprecated_sampling/double_sampling_2024AUG/Samples_Demography_N20_CCW-CCE-COW-COE_2024AUG12.list',quote=F,sep='\t',row.names=F,col.names=F)
And as before, subset for folded SFS:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_production
#SBATCH --cpus-per-task=3
#SBATCH --time=200:00:00
# mamba activate snps
# submit as for i in $(cat Chromosomes.list); do sbatch -J FILTER_${i} ~/merondun/cuculus_migration/snp_calling/branch_folded/7C.Subsample_Demography_N80Subset.sh ${i}; done
CHR=$1
#filtered vcfs directory
vcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/chromosome_vcfs
outvcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/subsampled_n80_vcfs
mkdir $outvcfs $outvcfs/folded $outvcfs/unfolded $outvcfs/invariant_mm1 $outvcfs/raw $outvcfs/stats
#samples
samples=/dss/dsshome1/lxc07/di39dux/merondun/cuculus_migration/deprecated_sampling/double_sampling_2024AUG/Samples_Demography_N20_CCW-CCE-COW-COE_2024AUG12.list
#this bed contains GOOD regions which are neutral for demography
neutral=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/extracting_neutral_sites/neutral-sites_cuckoo__intergenic-intron-4fold.bed
#this bed contains BAD sites which are repeats (bed does not include 'low_complexity' and 'simple_repeats', so it's largely TEs)
repeats=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/GCA_017976375.1_bCucCan1.pri_genomic.CHR_strip.AvianRepeatModeler_Repeats-ExcludingLowComplexity-SimpleRepeats.bed
### Generate Depth statistics for filtering on DP, retain only sites within mean coverage + 2*sd or mean covearge - 2*sd
bcftools query -f '%CHROM\t%POS\t%INFO/DP\n' ${vcfs}/${CHR}_raw.vcf.gz > ${outvcfs}/stats/${CHR}_raw.dp_stats.txt
mean=$(cat ${outvcfs}/stats/${CHR}_raw.dp_stats.txt | datamash mean 3 | xargs printf "%.0f\n")
sd=$(cat ${outvcfs}/stats/${CHR}_raw.dp_stats.txt | datamash sstdev 3 | xargs printf "%.0f\n")
# Round to nearest integer
low=$(echo "$mean - 2*$sd" | bc)
high=$(echo "$mean + 2*$sd" | bc)
#subset samples first, and filter for MQ. Right at the beginning, remove repeats and non-neutral sites
MINDP=5
bcftools view --threads 5 --samples-file ${samples} -Ov ${vcfs}/${CHR}_raw.vcf.gz | \
#RETAIN neutral sites
bedtools intersect -header -a - -b $neutral | \
#EXCLUDE repeats
bedtools subtract -header -a - -b $repeats | \
#SET genotypes below MINDP to missing
bcftools +setGT -Ou -- -t q -i "FMT/DP < ${MINDP}" -n "./." | \
#UPDATE AC fields after changing genotypes to missing
bcftools +fill-tags -Ou -- -t AC,AN | \
#RETAIN only sites with at least 90% genotypes
bcftools view --threads 5 -i "MQ > 30 & F_MISSING < 0.1 & INFO/DP > ${low} & INFO/DP < ${high}" -Oz -o ${outvcfs}/raw/${CHR}_allsitesN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz
bcftools index --threads 5 ${outvcfs}/raw/${CHR}_allsitesN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz
### for FOLDED SFS, just subset
bcftools view --threads 5 --types snps --min-alleles 2 --max-alleles 2 --min-ac 1 --max-af 0.9999 -i 'QUAL > 20' -Oz -o ${outvcfs}/folded/${CHR}_snpsN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz ${outvcfs}/raw/${CHR}_allsitesN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz
bcftools index --threads 5 ${outvcfs}/folded/${CHR}_snpsN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz
### And then simply grab the invariant sites
bcftools view --threads 5 --max-ac 0 -Oz -o $outvcfs/invariant_mm1/${CHR}_invariantN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz ${outvcfs}/raw/${CHR}_allsitesN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz
bcftools index --threads 5 $outvcfs/invariant_mm1/${CHR}_invariantN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz
### Summarize SNP Counts in tidy format
raw=$(bcftools index -n ${vcfs}/${CHR}_raw.vcf.gz)
neutral=$(bcftools index -n ${outvcfs}/raw/${CHR}_allsitesN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz)
folded=$(bcftools index -n ${outvcfs}/folded/${CHR}_snpsN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz)
invariant=$(bcftools index -n $outvcfs/invariant_mm1/${CHR}_invariantN80.MQ-5X-MM1-Neutral-NonRepetitive.vcf.gz)
echo -e "${CHR}\tRaw\t${raw}" > ${outvcfs}/stats/${CHR}.snp.counts
echo -e "${CHR}\tNeutral_MM1\t${neutral}" >> ${outvcfs}/stats/${CHR}.snp.counts
echo -e "${CHR}\tFolded\t${folded}" >> ${outvcfs}/stats/${CHR}.snp.counts
echo -e "${CHR}\tInvariant\t${invariant}" >> ${outvcfs}/stats/${CHR}.snp.counts
Subsetting n = 2 from the target and outgroup species, run admixture:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=10
#SBATCH --time=48:00:00
CHR=chr_1
vcf=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/chromosome_vcfs/unrelated_all/${CHR}_snp.MQ-5X.vcf.gz
wd=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/admixture/species_distinction
list=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/admixture/species_distinction/ID.list
# Only LD Pruning
bcftools view --threads 10 --samples-file ${list} -Ou ${vcf} | \
bcftools view --threads 10 --types snps --min-alleles 2 --max-alleles 2 --min-ac 1 --max-af 0.9999 -i 'F_MISSING < 0.2' | \
bcftools +prune -m 0.8 --window 50 -Oz -o ${wd}/${CHR}_snp.MQ-5X-LDr8w50.vcf.gz
bcftools index --threads 10 ${wd}/${CHR}_snp.MQ-5X-LDr8w50.vcf.gz
#Create plink bed files for admixture + perform pca
plink --threads 10 --const-fid --vcf ${wd}/${CHR}_snp.MQ-5X-LDr8w50.vcf.gz --chr-set 29 --allow-extra-chr --set-missing-var-ids @:# \
--make-bed --pca --out ${wd}/${CHR}_snp.MQ-5X-LDr8w50
cd ${wd}/admixture
for K in {2..10}; do
echo "RUNNING K: ${K}"
#Run Admixture
admixture -j7 --cv=5 ../${CHR}_snp.MQ-5X-LDr8w50.bed ${K} > ${CHR}_snp.MQ-5X-LDr8w50.log${K}.out
#Evaluate Runs
~/modules/evalAdmix/evalAdmix -plink ../${CHR}_snp.MQ-5X-LDr8w50 -fname ${CHR}_snp.MQ-5X-LDr8w50.${K}.P -qname ${CHR}_snp.MQ-5X-LDr8w50.${K}.Q -P 10 -o eval_${CHR}_snp.MQ-5X-LDr8w50.${K}
done
Also calculate FST:
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=10
#SBATCH --time=48:00:00
TMPDIR=/dss/dsslegfs01/pr53da/pr53da-dss-0021/tmp
VCF=chr_1_snp.MQ-5X-LDr8w50.vcf.gz
length=1000000000000
tabix ${VCF}
pixy --stats fst --bypass_invariant_check yes --vcf ${VCF} --populations ID.pixypop --window_size ${length} --n_cores 10 --output_folder fst
And plot:
### Plot ADMIXTURE species distinction, n = 2 each
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/admixture/species_distinction/all_admix_files')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(tidyverse)
library(viridis)
library(stringr)
library(meRo) #devtools::install_github('merondun/meRo')
library(ggpubr)
library(RColorBrewer)
qdir = '.' #directory with Q files
prefixes = gsub('.2.Q','',list.files('.',pattern='.*.2.Q'))
prefix = 'chr_1_snp.MQ-5X-LDr2w50'
counter = 0
for (prefix in prefixes) {
counter = counter + 1
cat('Working on run: ',prefix,'\n')
admix = melt_admixture(prefix = prefix, qdir = qdir)
#read in metadata
md = read_tsv('../ID.pixypop',col_names = F)
names(md) <- c('ID','Species_Latin')
admixmd = left_join(admix, md %>% select(ID,Species_Latin))
#loop through all admixture runs and extract the average correlation values from evalAdmix, we want to MINIMIZE this! (closest to 0)
evaldat = NULL; for (Kval in seq(2,10,1)){
r <- as.matrix(read.table(paste0("eval_",prefix,'.',Kval)))
mean_value <- mean(r,na.rm=TRUE)
median_value <- median(r,na.rm=TRUE)
sd_value <- sd(r,na.rm=TRUE)
iqr_value <- IQR(r,na.rm=TRUE)
valdat = data.frame(K = Kval,mean = mean_value,median=median_value,sd=sd_value,iqr=iqr_value)
evaldat = rbind(valdat,evaldat)
}
#plot, for main figure show the n=3 lowest median
targs = evaldat %>% slice_min(abs(median),n=3)
ep = evaldat %>%
ggplot(aes(x=K,y=median,ymin=median-iqr,ymax=median+iqr))+
geom_rect(data=targs,aes(xmin=K-0.25,xmax=K+0.25,ymin=-Inf,ymax=Inf),fill='darkseagreen3')+
geom_text(aes(y = 0.15, label = format(signif(median, 2), scientific = TRUE)),size=2) +
#ylim(c(-0.015,0.015))+
geom_hline(yintercept=0,lty=2)+
geom_point()+ylab('Median +/- IQR Correlation of Residuals') +
geom_errorbar()+
theme_bw() +
scale_x_continuous(breaks = seq(min(evaldat$K), max(evaldat$K), by = 1)) +
coord_flip()
ep
assign(paste0('e',1),ep)
png(paste0('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20240806_evalAdmix_',prefix,'_SpeciesDistinction.png'),res=300,units='in',height=5,width=6)
print(e1)
dev.off()
#Now plot ADMIXTURE
admixmd$Species_Latin <- factor(admixmd$Species_Latin,levels=c('C. canorus canorus','C. canorus bakeri','C. optatus','C. saturatus','C. micropterus','C. poliocephalus'))
adplot =
admixmd %>% filter(Specified_K <= 9 ) %>% #specify the levels you want
mutate(Specified_K = paste0('K',Specified_K)) %>%
ggplot(aes(x = factor(ID), y = Q, fill = factor(K))) +
geom_col(color = "gray", size = 0.1) +
facet_grid(Specified_K~Species_Latin, scales = "free", space = "free") +
theme_minimal(base_size=6) + labs(x = "",y = "") +
scale_y_continuous(expand = c(0, 0),n.breaks = 3) +
scale_fill_manual(values=viridis(9,option='turbo'))+
theme(
panel.spacing.x = unit(0.1, "lines"),
#axis.text.x = element_blank(),
axis.text.x=element_blank(),
axis.text.y = element_text(size=5),
panel.grid = element_blank(),
legend.position = 'bottom',
plot.title = element_text(size=6)
)
adplot
pdf(paste0('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20240806_ADMIXTURE_',prefix,'_SpeciesDistinction.pdf'),height=3,width=3)
print(adplot)
dev.off()
}
### Read in FST
fst <- read_tsv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/admixture/species_distinction/FST_Overall.txt')
# Plotting
fst$pop1 <- factor(fst$pop1,levels=c('C. saturatus','C. micropterus','C. poliocephalus','C. canorus bakeri','C. canorus canorus','C. optatus'))
fst$pop2 <- factor(fst$pop2,levels=c('C. saturatus','C. micropterus','C. poliocephalus','C. canorus bakeri','C. canorus canorus','C. optatus'))
fst_plot <- ggplot(fst, aes(x = pop1, y = pop2)) +
geom_tile(aes(fill = avg_wc_fst), colour = "white") + # use 'value' because pivot_wider creates it
geom_text(aes(label = round(avg_wc_fst, 2)),size = 3, vjust = 1) +
scale_fill_gradient(low = "white", high = "red") +
facet_grid(~ filter )+
theme_bw()+
labs(x = "", y = "", fill = "Avg W&C FST") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
fst_plot
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20240806_FST-Heatmap_SpeciesDistinction.pdf',height=3.5,width=8)
fst_plot
dev.off()
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=5
#SBATCH --time=48:00:00
CHR=$1
WORKDIR=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop
cd ${WORKDIR}
mkdir full_vcf
raw_vcf_dir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/merged/snps_only/
#From the full gVCF, including invariant sites, subset only samples of interest
bcftools view --threads 5 --types snps --samples-file ~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024MAR13.list ${raw_vcf_dir}/${CHR}.SNPS.vcf.gz -Ou | \
bcftools view --min-ac 1 --min-alleles 2 --max-alleles 2 -Oz -o full_vcf/${CHR}.raw.vcf.gz
#Apply filtering, only on MQ, DP
bcftools view --threads 5 --max-alleles 2 -i 'MQ > 30 && INFO/DP > 150 && QUAL > 20' -Oz -o full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz full_vcf/${CHR}.raw.vcf.gz
bcftools index full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz
#Phase VCF with beagle
java -jar -Xmx160g ~/modules/beagle.28Jun21.220.jar gt=full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz out=full_vcf/${CHR}.MQ20-DP150-Q20.PHASED nthreads=8 window=40 overlap=2 impute=true
bcftools index --threads 5 full_vcf/${CHR}.MQ20-DP150-Q20.PHASED.vcf.gz
#add the INFO DP,MQ and FMT/DP annotations back onto this VCF, from the pre-phased VCF
bcftools annotate --threads 5 -a full_vcf/${CHR}.MQ20-DP150-Q20.vcf.gz -c INFO/DP,INFO/MQ,FMT/DP full_vcf/${CHR}.MQ20-DP150-Q20.PHASED.vcf.gz -Oz -o full_vcf/${CHR}.MQ20-DP150-Q20.PHASED-ANN.vcf.gz
bcftools index --threads 5 full_vcf/${CHR}.MQ20-DP150-Q20.PHASED-ANN.vcf.gz
Mask for sites below coverage, each a masking file for each individual sample
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=2
#SBATCH --time=48:00:00
#submit sample as positional for RUN in $(cat ~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW-COE_2024MAR13.list); do sbatch -J COV_${RUN} 2.Coverage_Masks.sh ${RUN} ; done
SAMPLE=$1
bamdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/bams/Illumina_Alignments_Merged
#output mask directory
maskdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/coverage_masks
#directory with the n=40 subsampled phased, neutral and non-repetitive VCFS
vcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/full_vcf
#output directory with the individual chromosome level vcfs
indvcfs=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/individual_vcfs
cd /dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop
mkdir $maskdir $maskdir/work $indvcfs
#loop through each chromosome
for CHR in $(cat Chromosomes.list); do
#create a coverage/MQ mask
mosdepth --threads 2 --chrom ${CHR} $maskdir/work/${SAMPLE}_${CHR} $bamdir/${SAMPLE}.bam
avg=$(awk '{print $4}' $maskdir/work/${SAMPLE}_${CHR}.mosdepth.summary.txt | tail -n 1)
min=$(printf "%.0f" $(echo "$avg / 2" | bc -l))
max=$(printf "%.0f" $(echo "$avg * 2" | bc -l))
echo "FOR SAMPLE: ${SAMPLE} average coverage is ${avg}, retainining positions between ${min} and ${max}"
#grab only sites greater than or below half the chromosome avg or double it. grab sites we want to RETAIN!
zcat $maskdir/work/${SAMPLE}_${CHR}.per-base.bed.gz | awk -v x=${max} -v n=${min} '$4 < x && $4 > n' | awk '{OFS="\t"}{print $1, $2, $3}' | bgzip -c > $maskdir/${SAMPLE}_${CHR}.bed.gz
#finally, subset that sample from the population VCF
bcftools view --threads 2 --samples ${SAMPLE} -Ob full_vcf/${CHR}.MQ20-DP150-Q20.PHASED-ANN.vcf.gz | \
bcftools view --threads 2 --min-alleles 2 --max-alleles 2 --min-ac 1 --max-af 0.9999 -Oz -o $indvcfs/${SAMPLE}_${CHR}.vcf.gz
bcftools index $indvcfs/${SAMPLE}_${CHR}.vcf.gz
done
Run MSMC2 and MSMC-IM on all haplotypes (n=8):
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=15
#SBATCH --time=48:00:00
#mamba activate samtools0.1.19
#submit with script 5, otherwise: sbatch ~/merondun/cuculus_migration/msmc/4.Crosscoalescent_Iterative.sh CCW CCE 1
genome=/dss/dsslegfs01/pr53da/pr53da-dss-0021/assemblies/Cuculus.canorus/VGP.bCucCan1.pri/GCA_017976375.1_bCucCan1.pri_genomic.CHR.fa
#file with the mosdepth coverage masks (sites < 1/2 or > 2 the chromosome-sample-specific coverage are ignored )
maskdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/coverage_masks
#vcfs, individual for sample-chr
vcfdir=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/individual_vcfs
#genome-wide mappability mask
gwmask=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/demography/mappability/masks/
#neutral sites
neutral=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/neutral-sites_cuckoo__intergenic-intron-4fold.bed
#this bed contains BAD sites which are repeats (bed does not include 'low_complexity' and 'simple_repeats', so it's largely TEs)
repeats=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/GCA_017976375.1_bCucCan1.pri_genomic.CHR.AvianRepeatModeler_Repeats-ExcludingLowComplexity-SimpleRepeats.bed
cd /dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop
#submit population 1, population 2, iteration
P1=$1
P2=$2
IT=$3
mkdir -p crosscoal crosscoal/input crosscoal/output
#grab 2 random samples from each population
grep -w ${P1} ~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW_20241220.pop | awk '{print $1}' | shuf | head -n 2 > crosscoal/${P1}_${P2}_${IT}.inds
grep -w ${P2} ~/merondun/cuculus_migration/Samples_Demography_N10_CCW-CCE-COW_20241220.pop | awk '{print $1}' | shuf | head -n 2 >> crosscoal/${P1}_${P2}_${IT}.inds
#loop through chromosomes and create the msmc input file
for i in $(cat Chromosomes.list); do
rm crosscoal/${P1}_${P2}_${IT}.${i}_POPmask.list crosscoal/${P1}_${P2}_${IT}.${i}_POPvcf.list
for j in $(cat crosscoal/${P1}_${P2}_${IT}.inds); do
echo "--mask=${maskdir}/${j}_${i}.bed.gz " >> crosscoal/${P1}_${P2}_${IT}.${i}_POPmask.list
echo "${vcfdir}/${j}_${i}.vcf.gz" >> crosscoal/${P1}_${P2}_${IT}.${i}_POPvcf.list
done
generate_multihetsep.py --chr ${i} --negative_mask $repeats --mask ${gwmask}/GCA_017976375.1_${i}.mask.bed.gz --mask $neutral $(cat crosscoal/${P1}_${P2}_${IT}.${i}_POPmask.list) $(cat crosscoal/${P1}_${P2}_${IT}.${i}_POPvcf.list) > crosscoal/input/${P1}_${P2}_${IT}_${i}.multihetsep.txt
done
#Run MSMC2, first on each population, then on both together (exhaustive haplotype comparisons)
~/modules/msmc2_Linux -t 15 -p 1*2+16*1+1*2 -I 0,1,2,3 -o crosscoal/output/${P1}_${P2}_${IT}_msmc_FIRST $(ls crosscoal/input/${P1}_${P2}_${IT}_*)
~/modules/msmc2_Linux -t 15 -p 1*2+16*1+1*2 -I 4,5,6,7 -o crosscoal/output/${P1}_${P2}_${IT}_msmc_SECOND $(ls crosscoal/input/${P1}_${P2}_${IT}_*)
~/modules/msmc2_Linux -t 15 -p 1*2+16*1+1*2 -I -s -o crosscoal/output/${P1}_${P2}_${IT}_msmc_ALLHAPS $(ls crosscoal/input/${P1}_${P2}_${IT}_*)
#Merge all 4 iterations
python ~/modules/msmc-tools/combineCrossCoal.py \
crosscoal/output/${P1}_${P2}_${IT}_msmc_ALLHAPS.final.txt \
crosscoal/output/${P1}_${P2}_${IT}_msmc_FIRST.final.txt \
crosscoal/output/${P1}_${P2}_${IT}_msmc_SECOND.final.txt > crosscoal/output/${P1}_${P2}_${IT}_msmc_ALLHAPS.CROSSCOAL_final.txt
#Also run MSMC-IM, sing default settings
mu=1.01e-08
python ~/modules/MSMC-IM/MSMC_IM.py crosscoal/output/${P1}_${P2}_${IT}_msmc_ALLHAPS.CROSSCOAL_final.txt -p 1*2+16*1+1*2 --printfittingdetails --plotfittingdetails --xlog -mu ${mu} -o crosscoal/output/${P1}_${P2}_${IT}_msmc_ALLHAPS.CROSSCOAL_MSMCIM.final.txt
Submit:
# Loop through each pair of populations
for pair in "CCW CCE" "CCW CO" "CCE CO"; do
# Split the pair into P1 and P2
read P1 P2 <<<$(echo $pair)
# Loop through each iteration
for IT in {1..10}; do
# Submit the job with sbatch
sbatch -J "CROSSCOAL_${P1}_${P2}_${IT}" ~/merondun/cuculus_migration/msmc/4.Crosscoalescent_Iterative.sh ${P1} ${P2} ${IT}
done
done
.libPaths('~/mambaforge/envs/R/lib/R/library')
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/msmc/20241220_CrossCoal3Pop/crosscoal/output')
library(tidyverse)
library(viridis)
library(meRo)
library(RColorBrewer)
library(gmodels)
#for binding later
msmcdat = NULL
files = list.files('.',pattern='ALLHAPS.CROSSCOAL_final.txt')
for (f in files) {
cat('Working on file: ',f,'\n')
t = read_tsv(f)
string = gsub('_msmc.*','',f)
splist = strsplit(string,'_')[[1]]
g1 = split[1]
g2 = split[2]
run = split[3]
t$P1 = g1; t$P2 = g2; t$Iteration = run; t$group = paste0(g1,'__',g2)
msmcdat = rbind(msmcdat,t)
}
mu = 1.01e-08
gen = 2.74
#label for y axis
millions_label <- function(x) {
return(scales::label_number(suffix = "K", scale = 1e-3)(x))
}
#plot each
msmc_each = msmcdat %>% pivot_longer(!c(Iteration,time_index,left_time_boundary,right_time_boundary,P1, P2, group),names_to = 'pop',values_to = 'lambda') %>%
mutate(
#time = left_time_boundary/mu*gen,
time = right_time_boundary/mu*gen,
Ne = (1/lambda)/(2*mu)
)
msmc_each = msmc_each %>% separate(pop,into=c('d1','ID')) %>%
mutate(popID = ifelse(ID == '00',P1,
ifelse(ID == '11',P2,'Crosspopulation')),
popID = gsub('D','G',popID)) %>%
dplyr::select(-d1,-ID,-P1,-P2)
#remove time boundaries where the estimate for a single population varies high
unstable = msmc_each %>% group_by(popID,time_index) %>%
filter(popID != 'Crosspopulation') %>%
summarize(mean=mean(Ne),sd=sd(Ne))
unstp = unstable %>%
ggplot(aes(x=time_index,y=sd,col=popID))+
geom_point(size=0.75)+
geom_line()+
scale_color_viridis(discrete=TRUE)+
theme_bw(base_size=8)
unstp
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMC_3pop_All-Instability.pdf',height=2,width=3)
unstp
dev.off()
#inspect Ne for outliers
msmc_each %>%
filter(popID != 'Crosspopulation' & time_index >= 3) %>%
group_by(group, popID) %>%
ungroup() %>%
ggplot(aes(x=popID,fill=group,y=Ne))+geom_boxplot()+theme_bw()
msmc_each %>% count(group)
# Filter for final plotting, removing the highly unstable time periods (<3)
population_dat = msmc_each %>% filter(popID != 'Crosspopulation' & time_index >= 3)
cols <- data.frame(popID = c('CCW','CCE','CO','COW'),
col = c('#33a02c','#b2df8a','gold','#fe9024'))
#plot each
xp = population_dat %>%
#filter(popID != 'COW') %>%
ggplot(aes(x = time, y = Ne, col = popID,lty=Iteration)) +
geom_step(lwd = 0.25, show.legend = TRUE) +
scale_x_log10(limits=c(1e4,1e6),breaks = c(1e4,1e5,1e6), labels = c('10KY','100KY','1MY')) + # Modify breaks as needed
scale_y_log10(limits=c(1e4,3e6),breaks = c(1e4,5e4,1e5,2.5e5,5e5,1e6), labels = millions_label) + # Modify breaks as needed
theme(strip.text.y = element_text(angle = 0)) +
scale_color_manual(values=cols$col,breaks=cols$popID)+
ylab('Ne')+xlab('Time')+
theme_test(base_size=7)+theme(legend.position='top')
xp
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMC_3pop_All_NeLines.pdf',height=2,width=2)
xp
dev.off()
#average Ne of each population
population_dat %>% filter(popID != 'Crosspopulation') %>% group_by(popID) %>% sum_stats(Ne)
population_dat %>% filter(popID != 'Crosspopulation') %>% group_by(popID) %>% slice_min(time)
# popID mean sd se median iqr conf_low conf_high
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 CCE 105574. 77034. 3596. 90950. 69297. 98508. 112640.
# 2 CCW 190236. 250428. 11689. 124133. 74881. 167265. 213206.
# 3 CO 243492. 363291. 20768. 156906. 93296. 202625. 284359.
# 4 COW 219910. 281023. 16065. 157642. 91971. 188298. 251523.
# #
# time_index left_time_boundary right_time_boundary Iteration group lambda time Ne popID
# <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <chr>
# 1 3 0.0000292 0.0000494 1 CCW__CCE 187. 13395. 264610. CCE
# 2 3 0.0000292 0.0000494 1 CCW__CCE 44.0 13395. 1126166. CCW
# 3 3 0.0000322 0.0000544 1 CCE__CO 29.1 14761. 1698615. CO
# 4 3 0.0000325 0.0000549 4 CCE__COW 35.1 14898. 1411688. COW
#or using min/max across runs
popdat_all = msmc_each %>% filter(popID != 'Crosspopulation' & time_index >= 3 & time != Inf) %>%
group_by(popID,time_index) %>%
summarize(time = min(time),
minNe = min(Ne),
maxNe = max(Ne),
meanNe = mean(Ne))
#plot ribbons
ribplot = popdat_all %>%
ggplot(aes(x = time, ymin = minNe, ymax=maxNe, fill = popID)) +
geom_ribbon(alpha=0.5)+
scale_x_log10(limits=c(1e4,1e6),breaks = c(1e4,1e5,1e6), labels = c('10KY','100KY','1MY')) + # Modify breaks as needed
scale_y_log10(limits=c(1e4,3e6),breaks = c(1e4,5e4,1e5,2.5e5,5e5,1e6), labels = millions_label) + # Modify breaks as needed
theme(strip.text.y = element_text(angle = 0)) +
scale_fill_manual(values=cols$col,breaks=cols$popID)+
ylab('Min and Max Ne')+xlab('Time')+
theme_test(base_size=7)
ribplot
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMC_3pop-All_Ribbons.pdf',height=1.5,width=2.5)
ribplot
dev.off()
# #interplotated to smooth ribbons
# n_interpolated_points = 10000
# popdat_all_interpolated <- popdat_all %>%
# group_by(popID) %>%
# mutate(
# time_new = list(seq(min(time), max(time), length.out = n_interpolated_points)),
# minNe_new = list(spline(time, minNe, n = n_interpolated_points)$y),
# maxNe_new = list(spline(time, maxNe, n = n_interpolated_points)$y)
# ) %>%
# ungroup() %>%
# select(-time, -minNe, -maxNe) %>%
# unnest(c(time_new, minNe_new, maxNe_new)) %>%
# rename(time = time_new, minNe = minNe_new, maxNe = maxNe_new)
#
# ribplot = popdat_all_interpolated %>%
# ggplot(aes(x = time, ymin = minNe, ymax=maxNe, fill = popID)) +
# #geom_line(data=popdat_all,aes(x=time,y=meanNe))+
# geom_ribbon(alpha=0.8)+
# scale_x_log10(limits=c(1e4,1e6),breaks = c(1e4,1e5,1e6), labels = c('10KY','100KY','1MY')) + # Modify breaks as needed
# scale_y_log10(limits=c(1e4,1e6),breaks = c(1e4,1e5,2.5e5,5e5,1e6), labels = millions_label) + # Modify breaks as needed
# theme(strip.text.y = element_text(angle = 0)) +
# scale_fill_viridis(discrete=TRUE)+
# ylab('Min and Max Ne')+xlab('Time')+
# theme_test(base_size=7)+
# theme(legend.position='top')
# ribplot
#
# pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2021__Cuckoo_Resequencing/vcfs/all_samples-2022_11/host/figures/MSMC_FourPopulations_Introgression-Error_2024APR1.pdf',height=2.5,width=2.5)
# ribplot
# dev.off()
#plot cross coalesence rate, and add interspecific vs intraspecific comparison
msmc = msmcdat %>%
#filter(time_index >= 3) %>%
#filter(P1 != 'CO' & P2 != 'CO') %>%
mutate(time = left_time_boundary/mu*gen,
crossrate = pmin(1,2*lambda_01 / (lambda_00 + lambda_11))) %>%
mutate(Comparison = ifelse(grepl('CC',P1) & grepl('CC',P2),'C. canorus',
ifelse(grepl('CO',P1) & grepl('CO',P2),'C. optatus',
'Interspecific')))
#import ice core data
icecore = read_tsv('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/Ice_Temperature_Reconstructions_Kawamura_NOAA-6076.txt')
icecore = icecore %>% dplyr::rename(time = TopAge)
#scale ice core according to the cross rate
crossrate_range <- range(msmc$crossrate, na.rm = TRUE)
deltaT_range <- range(icecore$deltaT, na.rm = TRUE)
scale_factor <- diff(crossrate_range) / diff(deltaT_range)
icecore$scaled_deltaT <- (icecore$deltaT - mean(deltaT_range)) * scale_factor + mean(crossrate_range)
msmc <- msmc %>% mutate(Opt = ifelse(grepl('COW$',P1) | grepl('COW$',P2),'C. optatus western (n=10)','Merged C. optatus (n=20)'))
climates <- data.frame(
xmin = c(0, 24e3, 57e3, 122e3),
xmax = c(10e3, 34e3, 67e3, 132e3),
ymin = -Inf,
ymax = Inf,
fill = c("#c3e6ee","#ffbfbf","#ffbfbf","#c3e6ee"))
# fill = c("#ffbfbf","#c3e6ee","#c3e6ee", "#ffbfbf"))
ap = msmc %>%
#filter(Iteration == 1) %>%
ggplot(aes(x = time, y = crossrate,Group=interaction(P1,P2),col=interaction(P1,P2),lty=Iteration)) +
geom_rect(
data = climates,
aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf, fill = fill),
alpha = 0.1, inherit.aes=FALSE) +
geom_step(lwd = 0.25, show.legend = TRUE) +
scale_x_log10(breaks = c(1e4,2.5e4,5e4,1e5,1.5e5,2.5e5), labels = c('10 Ka','25 Ka','50 Ka','100 Ka','150 Ka','250 Ka')) +
coord_cartesian(ylim=c(0,1),xlim=c(2e3,3.5e5))+
scale_y_continuous(
name = "Crossrate",
sec.axis = sec_axis(~ . / scale_factor + mean(deltaT_range), name = "Delta T (°C)")
) +
theme(strip.text.y = element_text(angle = 0)) +
geom_line(data = icecore, aes(x = time, y = scaled_deltaT, group = 1), lwd=0.25,lty=2,inherit.aes=FALSE,color = "black")+
scale_color_manual(values=c('darkorchid1','#b2df8a','#33a02c','#b2df8a','#33a02c'))+
theme_test(base_size=7)+
facet_grid(Opt~.)+
theme(legend.position='top')
ap
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMC_CrosscoalLines-3pop-All_Migration_IceCore.pdf',height=3,width=3)
ap
dev.off()
## Rib plot for crosscoal
#or using min/max across runs
popdat_ccr = msmc %>%
group_by(time_index,P1,P2,Comparison,Opt) %>%
mutate(time = min(time)) %>% ungroup %>%
group_by(time,P1,P2,Comparison,Opt) %>%
sum_stats(crossrate)
#plot ribbons
ribplot_ccr = popdat_ccr %>%
ggplot(aes(x = time, ymin = conf_low, Group=interaction(P1,P2), ymax=conf_high, fill = interaction(P1,P2))) +
geom_rect(
data = climates,
aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf, fill = fill),
alpha = 0.3, inherit.aes=FALSE) +
geom_ribbon(alpha=0.5)+
scale_x_log10(breaks = c(1e4,2.5e4,5e4,1e5,1.5e5,2.5e5), labels = c('10 Ka','25 Ka','50 Ka','100 Ka','150 Ka','250 Ka')) +
coord_cartesian(ylim=c(0,1),xlim=c(2e3,3.5e5))+
scale_y_continuous(
name = "Crossrate",
sec.axis = sec_axis(~ . / scale_factor + mean(deltaT_range), name = "Delta T (°C)")
) +
scale_fill_manual(values=c('darkorchid1','#b2df8a','#33a02c','#b2df8a','#33a02c',"#c3e6ee","#ffbfbf"))+
theme(strip.text.y = element_text(angle = 0)) +
geom_line(data = icecore, aes(x = time, y = scaled_deltaT, group = 1),lwd=0.25, lty=2,inherit.aes=FALSE,color = "black")+
scale_color_manual(values=brewer.pal(3,'Set2'))+
theme_test(base_size=7)+
facet_grid(Opt~.)+
theme(legend.position='top')
ribplot_ccr
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMC_CrosscoalRibbons-3pop-All_Migration_IceCore.pdf',height=2.5,width=3)
ribplot_ccr
dev.off()
#plot MSMC-IM
imdat = NULL
imfiles = list.files('.',pattern='MSMC_IM.estimates.txt')
#imfiles <- imfiles[!grepl('_CO_',imfiles)]
for (f in imfiles) {
cat('Working on file: ',f,'\n')
t = read_tsv(f)
string = gsub('_msmc.*','',f)
split = strsplit(string,'_')[[1]]
g1 = split[1]
g2 = split[2]
run = split[3]
t$P1 = g1; t$P2 = g2; t$Iteration = run
imdat = rbind(imdat,t)
}
mu = 1.01e-08
gen = 2.74
imdat <- imdat %>% mutate(Opt = ifelse(grepl('COW$',P1) | grepl('COW$',P2),'C. optatus western (n=10)','Merged C. optatus (n=20)'))
msmcim = imdat %>%
group_by(P1, P2, Iteration,Opt) %>%
arrange(left_time_boundary) %>%
mutate(time_index = row_number(),
time = left_time_boundary*gen) %>% #note that mu is already integrated into MSMC-IM!
#slice(-c(1:4)) %>%
ungroup() %>%
mutate(Comparison = ifelse(grepl('CC',P1) & grepl('CC',P2),'C. canorus',
ifelse(grepl('CO',P1) & grepl('CO',P2),'C. optatus',
'Interspecific')))
mp = msmcim %>%
ggplot(aes(x=time,y=m,group=interaction(P1,P2,Iteration),lty=Iteration,col=interaction(P1,P2)))+
geom_rect(
data = climates,
aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf, fill = fill),
alpha = 0.01, inherit.aes=FALSE) +
geom_step(show.legend = TRUE) +
scale_color_manual(values=c('darkorchid1','#b2df8a','#33a02c','#b2df8a','#33a02c'))+
#facet_wrap(~Opt,scales='free')+
scale_x_log10(limits=c(5e3,1e5),breaks = c(5e3,1e4,2e4,5e4,1e5), labels = c('5 Ka','10 Ka','20 Ka','50Ka','100 Ka'))+
ylab('Migration Rate (m)')+xlab('Time (Years)')+
theme_test(base_size=7)+
geom_rect(
data = climates,
aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf, fill = fill),
alpha = 0.2, inherit.aes=FALSE) +
facet_grid(Opt~.)+
theme(legend.position='top')
mp
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMCIM_3pop-All_AllLines.pdf',height=2.5,width=3)
mp
dev.off()
#average by group
mig_dat = msmcim %>% ungroup %>%
group_by(P1,P2,Comparison,time_index,Opt) %>%
mutate(time = min(time)) %>% ungroup %>%
group_by(time,P1,P2,Comparison,Opt) %>%
sum_stats(m) %>%
ungroup %>%
arrange(time)
#and just plot 1 for final figure
mp2 = mig_dat %>%
ggplot(aes(x=time,ymin=conf_low,ymax=conf_high,group=interaction(P1,P2),fill=interaction(P1,P2)))+
geom_ribbon(alpha=0.5)+
scale_x_log10(limits=c(3000,1e5),breaks = c(5e3,1e4,2e4,5e4,1e5), labels = c('5 Ka','10 Ka','20 Ka','50Ka','100 Ka'))+
ylab('Migration Rate (m)')+xlab('Time (Years)')+
theme_test(base_size=7)+
geom_rect(
data = climates,
aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf, fill = fill),
alpha = 0.3, inherit.aes=FALSE) +
facet_grid(Opt~.)+
scale_fill_manual(values=c('darkorchid1','#b2df8a','#33a02c','#b2df8a','#33a02c',"#ffbfbf","#c3e6ee"))+
theme(legend.position='top')
mp2
pdf('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/figures/20250114_MSMCIM_3pops_All-Ribbons.pdf',height=2.5,width=2.5)
mp2
dev.off()
#!/bin/bash
#SBATCH --get-user-env
#SBATCH --mail-user=merondun@bio.lmu.de
#SBATCH --clusters=biohpc_gen
#SBATCH --partition=biohpc_gen_normal
#SBATCH --cpus-per-task=3
#SBATCH --time=48:00:00
GROUP=$1
#mamba activate snps
VCF=/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/chromosome_vcfs/unrelated_chyiyin/chr_6_snp.MQ-5X-MM1.vcf.gz
plink --vcf $VCF --keep ${GROUP}.plist --double-id --allow-extra-chr \
--set-missing-var-ids @:# \
--mac 2 --thin 0.01 --geno 0.1 \
-r2 gz --ld-window 999999 --ld-window-kb 1000 \
--ld-window-r2 0 \
--make-bed --out decay/${GROUP}
zcat decay/${GROUP}.ld.gz | awk '{OFS="\t"}{print $2, $5, $7}' | gzip -c > decay/${GROUP}.ldout.gz
### Plot FST Scan
setwd('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/fst_scan/all_west_east/')
.libPaths('~/mambaforge/envs/r/lib/R/library')
library(tidyverse)
library(meRo) #devtools::install_github('merondun/meRo')
library(karyoploteR)
# Load in LD
library(data.table)
ld <- fread('/dss/dsslegfs01/pr53da/pr53da-dss-0021/projects/2023__MigratoryGenomics/analyses/fst_scan/all_west_east/LD/decay/W_E.ldout.gz')
lds <- ld %>% filter(BP_A > 28e6 & BP_A < 32e6 & BP_B > 28e6 & BP_B < 32e6)
lds %>% ggplot(aes(x=as.factor(BP_A),y=as.factor(BP_B),fill=R2))+
geom_tile()+
theme_void()+
scale_fill_gradient(low='yellow',high='red')+
theme(axis.text.x = element_blank(),
axis.text.y = element_blank(),
axis.ticks.x = element_blank(),
axis.ticks.y = element_blank())+
geom_vline(xintercept = c(29.95e6, 31.2e6),col='blue')